def train(config, model_data, data, record):
    """Train one candidate model and fill *record* with its results.

    Parameters
    ----------
    config : dict
        Run configuration; only ``config['n_jobs']`` is read here.
    model_data : tuple
        ``(model_class_name, percentile)`` — the dotted class path of the
        estimator and the feature-selection percentile to use.
    data : dict
        Must contain ``'train_x'``, ``'train_y'``, ``'test_x'`` and
        ``'test_ids'``.  # assumes array-likes accepted by sklearn — TODO confirm
    record : dict
        Mutated in place with model metadata, CV accuracy, and test
        predictions.  Nothing is returned.
    """
    model_class_name, percentile = model_data
    model = instantiate_from_class_string(model_class_name)

    # Best-effort parallelism: not every estimator exposes n_jobs.
    # (Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.)
    try:
        model.n_jobs = config['n_jobs']
    except Exception:
        log.info('Cannot set n_jobs for this model...')

    record['model'] = model_name(model)
    record['parameters'] = model.get_params()
    record['feats_percentile'] = percentile

    train_x = data['train_x']
    train_y = data['train_y']
    test_x = data['test_x']

    # Single pipeline: feature selection -> scaling -> estimator.
    pipeline = make_pipeline(
        SelectPercentile(f_classif, percentile), StandardScaler(), model)

    # Estimate accuracy with 5-fold cross-validation on the training set.
    scores = cross_validation.cross_val_score(
        pipeline, train_x, train_y, cv=5, scoring='accuracy')
    record['mean_acc'] = scores.mean()

    # BUG FIX: the original manually re-applied SelectPercentile and
    # StandardScaler to train_x/test_x and then fit the *pipeline* on the
    # already-transformed features, so selection and scaling ran twice at
    # fit/predict time.  Fit the pipeline once on the raw data instead,
    # matching exactly what cross_val_score evaluated above.
    pipeline.fit(train_x, train_y)
    preds = pipeline.predict(test_x)

    ids = data['test_ids']
    record['test_preds'] = list(zip(ids, preds))
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score
from sklearn.cross_validation import train_test_split

# Candidate estimator classes to expand into a hyper-parameter grid.
classes = ["sklearn.ensemble.RandomForestClassifier"]
models = grid_generator.grid_from_classes(classes)

# Hold out 30% of the iris data for evaluation.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.30)

# create a new experiment
ex = Experiment(main["logger"])

for clf in models:
    # one record per candidate model
    rec = ex.record()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    rec["precision"] = precision_score(y_test, predictions)
    rec["parameters"] = clf.get_params()
    rec["model"] = model_name(clf)

# keep only the two best records by precision
ex.records = top_k(ex.records, "precision", 2)
# persist the surviving records to the database
ex.save()