Example #1
    # setting up GPU environment
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # set GPU 0

    # parser = argparse.ArgumentParser(description='Utility to build Classic ML models')
    # parser.add_argument('-i', '--infile', required=True)
    # parser.add_argument('-nl', '--n_layer', required=True)
    # args = parser.parse_args()

    dataset = 'trainingset'
    features = 'ECFP6'
    n_layer = 3
    n_split = 5

    X_train, y_train = make_dataset('{}.sdf'.format(dataset), features)

    print("Training set includes {} descriptors".format(X_train.shape[1]))
    print("Training set includes {}, {} molecules".format(
        X_train.shape[0], y_train.shape[0]))

    nodes = X_train.shape[1]
    num_hidden = [nodes for _ in range(n_layer)]
    num_steps = 10001

    out_batch = BatchLogger(display=5)
    reduce_lr = ReduceLROnPlateau(monitor='loss',
                                  factor=0.9,
                                  patience=50,
                                  min_lr=0.00001,
                                  verbose=1)
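
`ReduceLROnPlateau` and the project's custom `BatchLogger` are Keras callbacks, but the snippet stops before they are handed to a model. Purely as a hedged sketch (the network architecture, optimizer, epoch count, and batch size below are assumptions, not part of the original code), they would typically be wired into training like this:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# hypothetical fully connected classifier sized from num_hidden
model = Sequential()
model.add(Dense(num_hidden[0], activation='relu', input_dim=X_train.shape[1]))
for width in num_hidden[1:]:
    model.add(Dense(width, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

# callbacks defined above; out_batch is the project's BatchLogger instance
model.fit(X_train, y_train, epochs=100, batch_size=32,
          callbacks=[out_batch, reduce_lr])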
seed = 0
env_var = args.env_var
data_dir = os.getenv(env_var)
name_col = args.name_col
endpoint = args.endpoint
threshold = args.threshold
test_set_size = args.test_set_size

# Check to see if necessary directories are present and if not, create them
directory_check(data_dir)

# get training data and split in training, test
# and use a seed for reproducibility
X, y = make_dataset(f'{dataset}.sdf',
                    data_dir=env_var,
                    features=features,
                    name_col=name_col,
                    endpoint=endpoint,
                    threshold=threshold)
X_train, y_train_class, X_test, y_test_class = split_train_test(
    X, y, n_splits, test_set_size, seed, None)
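
`split_train_test` is a helper from this project whose implementation is not shown. Purely as an illustration of the kind of split it performs (treating `test_set_size` as a fraction and stratifying on the class labels are assumptions), the equivalent step with scikit-learn directly would be:

from sklearn.model_selection import train_test_split

# illustrative stand-in, not the project's split_train_test
X_train, X_test, y_train_class, y_test_class = train_test_split(
    X, y, test_size=test_set_size, random_state=seed, stratify=y)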

# prune CLASSIFIER_ALGORITHMS depending on whether the descriptors are binary:
# non-binary features drop the entry at index 4, binary features drop index 1
if len(pd.unique(X.values.ravel('K'))) > 2:
    CLASSIFIER_ALGORITHMS.pop(4)
else:
    CLASSIFIER_ALGORITHMS.pop(1)

cv = model_selection.StratifiedKFold(shuffle=True,
                                     n_splits=n_splits,
                                     random_state=seed)
threshold = args.threshold
evaluate = args.evaluate
train_name = args.train_name

for alg in ['ada', 'bnb', 'knn', 'nb', 'rf', 'svc']:
    for feature in features:
        for endpoint in endpoints:
            model_name = f'{alg}_{train_name}_{feature}_{endpoint}_{threshold}_pipeline'
            model_file_path = os.path.join(data_dir, 'ML_models',
                                           f'{model_name}.pkl')

            if os.path.exists(model_file_path):
                if evaluate in ['Y', 'y']:
                    X_pred_set, y = make_dataset(f'{prediction_set}.sdf',
                                                 data_dir=env_var,
                                                 features=feature,
                                                 name_col=name_col,
                                                 endpoint=endpoint,
                                                 threshold=threshold)

                else:
                    X_pred_set = make_dataset(f'{prediction_set}.sdf',
                                              data_dir=env_var,
                                              features=feature,
                                              name_col=name_col,
                                              endpoint=None,
                                              threshold=None,
                                              pred_set=True)

                loaded_model = load(model_file_path)
                predictions = loaded_model.predict(X_pred_set)
                probabilities = loaded_model.predict_proba(X_pred_set)
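
Note that `load` is called above without an import appearing in the snippet; in this project it is presumably `joblib.load`, i.e.:

from joblib import load  # assumption: the pickled sklearn pipelines are read back with joblib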
args = parser.parse_args()
data_dir = os.getenv(args.env_var)
env_var = args.env_var
features = args.features
name_col = args.name_col
prediction_set = args.prediction_set
endpoint = args.endpoint
threshold = args.threshold
train_name = args.train_name
algorithms = args.algorithms.lower().split(',')
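
These snippets read many attributes off `args` without showing the parser. A hypothetical `argparse` definition covering the attributes used here (flag names, types, and defaults are inferred, not taken from the source) could look like:

import argparse

parser = argparse.ArgumentParser(
    description='Make predictions with previously trained pipelines')
parser.add_argument('--env_var', required=True,
                    help='environment variable pointing at the data directory')
parser.add_argument('--features', required=True)
parser.add_argument('--name_col', required=True)
parser.add_argument('--prediction_set', required=True)
parser.add_argument('--endpoint', required=True)
parser.add_argument('--threshold', type=int, required=True)
parser.add_argument('--train_name', required=True)
parser.add_argument('--algorithms', default='rf',
                    help='comma-separated list, e.g. "rf,svc,knn"')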

X_pred = make_dataset(f'{prediction_set}.sdf',
                      data_dir=env_var,
                      features=features,
                      name_col=name_col,
                      endpoint=endpoint,
                      threshold=threshold,
                      pred_set=True)
algorithms = [alg.strip() for alg in algorithms if alg.strip()]  # drop empty entries from the comma-split list
preds = []

if len(algorithms) < 1:
    raise ValueError(
        'Please enter at least one algorithm with which to make predictions.')

for alg in algorithms:
    model_name = f'{alg}_{train_name}_{features}_{endpoint}_{threshold}_pipeline'
    model_file_path = os.path.join(data_dir, 'models', f'{model_name}.pkl')

    if os.path.exists(model_file_path):
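
The snippet is cut off inside the `if os.path.exists(...)` check. As a hedged sketch only, the branch plausibly continues much like the loop in the previous snippet, collecting results in `preds` and writing them out at the end (the tuple layout, the skip message, and the CSV output step are assumptions):

from joblib import load
import pandas as pd

for alg in algorithms:
    model_name = f'{alg}_{train_name}_{features}_{endpoint}_{threshold}_pipeline'
    model_file_path = os.path.join(data_dir, 'models', f'{model_name}.pkl')

    if os.path.exists(model_file_path):
        loaded_model = load(model_file_path)
        preds.append((alg, loaded_model.predict(X_pred)))
    else:
        print(f'No saved pipeline found for {model_name}; skipping.')

# hypothetical output step: one column of class predictions per algorithm
pred_df = pd.DataFrame(dict(preds), index=X_pred.index)
pred_df.to_csv(os.path.join(data_dir, 'predictions.csv'))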
Example #5
seed = 0
env_var = args.data_dir
data_dir = os.getenv(env_var)
name_col = args.name_col
endpoint = args.endpoint
test_set_size = args.test_set_size

# Check to see if necessary directories are present and if not, create them
directory_check(data_dir)

# get training data and split in training, test
# and use a seed for reproducibility
X, y_regress, y_class = make_dataset('{}.sdf'.format(dataset),
                                     data_dir=env_var,
                                     features=features,
                                     name_col=name_col,
                                     endpoint=endpoint,
                                     regress=True,
                                     threshold=2000)

X_train, y_train_regress, X_test, y_test_regress = split_train_test(
    X, y_class, n_splits, test_set_size, seed, None)

# map the regression endpoint onto the class-stratified split above
if test_set_size != 0:
    y_train_regress = y_regress.loc[X_train.index].values.ravel()
    y_test_regress = y_regress.loc[X_test.index].values.ravel()
else:
    y_train_regress = y_regress.loc[X_train.index].values.ravel()

cv = model_selection.KFold(shuffle=True, n_splits=n_splits, random_state=seed)

for name, clf, params in REGRESSOR_ALGS:
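
The regression snippet stops at the loop header. As a hedged sketch (the example estimator, its parameter grid, and the scoring metric are assumptions; `REGRESSOR_ALGS` itself is defined elsewhere in the project), a typical body mirrors the classifier loop in Example #6:

from sklearn import model_selection, pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# stand-in for one (name, estimator, param_grid) entry of REGRESSOR_ALGS
example_algs = [('rf', RandomForestRegressor(random_state=seed),
                 {'rf__n_estimators': [100, 500]})]

for name, reg, params in example_algs:
    pipe = pipeline.Pipeline([('scaler', StandardScaler()), (name, reg)])
    grid_search = model_selection.GridSearchCV(pipe, param_grid=params, cv=cv,
                                               scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train_regress)
    print(name, grid_search.best_score_, grid_search.best_params_)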
Example #6
seed = 0
env_var = args.data_dir
data_dir = os.getenv(env_var)
name_col = args.name_col
endpoint = args.endpoint
threshold = args.threshold
test_set_size = args.test_set_size

# Check to see if necessary directories are present and if not, create them
directory_check(data_dir)

# get training data and split in training, test
# and use a seed for reproducibility
X, y = make_dataset('{}.sdf'.format(dataset),
                    data_dir=env_var,
                    features=features,
                    name_col=name_col,
                    endpoint=endpoint,
                    threshold=threshold)
X_train, y_train_class, X_test, y_test_class = split_train_test(
    X, y, n_splits, test_set_size, seed, None)

cv = model_selection.StratifiedKFold(shuffle=True,
                                     n_splits=n_splits,
                                     random_state=seed)

print("Num Actives: {}".format((y == 1).sum()))
print("Num Inactives: {}".format((y == 0).sum()))

for name, clf, params in CLASSIFIER_ALGS:
    pipe = pipeline.Pipeline([('scaler', StandardScaler()), (name, clf)])
    grid_search = model_selection.GridSearchCV(pipe,
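
The listing is cut off inside the `GridSearchCV` call. A hedged sketch of how such a search is typically completed (the scoring metric, `n_jobs`, and the reporting line are assumptions):

    grid_search = model_selection.GridSearchCV(pipe,
                                               param_grid=params,
                                               cv=cv,
                                               scoring='balanced_accuracy',
                                               n_jobs=-1)
    grid_search.fit(X_train, y_train_class)
    print(f'{name}: best CV score = {grid_search.best_score_:.3f} '
          f'with {grid_search.best_params_}')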