index_filter = args.index_filter feature_filter = args.feature_filter if index_filter is not None: index_filter = pd.read_csv(index_filter, index_col=0).index paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(n_samples)] if model == 'tree': partial_fit = partial(parallel.fit_tree, feature_filter=feature_filter, folds=folds, inverse=inverse, max_depth=max_depth, min_samples_split=min_samples_split, lc_filter=lc_filter) elif model == 'rf': partial_fit = partial(parallel.fit_rf, feature_filter=feature_filter, folds=folds, inverse=inverse, lc_filter=lc_filter) elif model == 'sktree': partial_fit = partial(parallel.fit_sktree, feature_filter=feature_filter, folds=folds, inverse=inverse, max_depth=max_depth, min_samples_split=min_samples_split, lc_filter=lc_filter) pool = Pool(processes=n_processes, maxtasksperchild=2) resultados = pool.map(partial_fit, paths) pool.close() pool.join() result = metrics.aggregate_predictions(resultados) result.to_csv(result_path) print metrics.weighted_f_score(metrics.confusion_matrix(result))
# NOTE(review): this fragment appears to be the BODY of a cross-validation
# fold loop — it reads `train_index` and appends to `results`, but the
# enclosing `for train_index, test_index in skf:` header (present in the
# near-identical block below) is outside this view. Formatting reconstructed;
# code tokens unchanged.

# Slice the training split of the original feature matrix / labels.
train_X, train_y = data.iloc[train_index], y.iloc[train_index]
clf = None
clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)
clf.fit(train_X, train_y)

# Evaluate the fitted forest on every sampled copy of the data set.
aux = []
for path in paths:
    test_data = pd.read_csv(path, index_col=0)
    # Align the sampled set with the original data's index order.
    test_data = test_data.loc[data.index].sort_index()
    test_data, test_y = utils.filter_data(
        test_data, feature_filter=feature_filter)
    # NOTE(review): rows are selected with train_index, not test_index, so
    # the forest is scored on (sampled copies of) its own training rows —
    # confirm this is intentional.
    test_X, test_y = test_data.iloc[train_index], test_y.iloc[
        train_index]
    aux.append(metrics.predict_table(clf, test_X, test_y))

# Fold result: predictions aggregated across all sampled sets.
results.append(metrics.aggregate_predictions(aux))
ids.extend(test_X.index.tolist())

result = pd.concat(results)

# Persist the classifier (from the last fitted fold) and the predictions.
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()
result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
# Cross-validated random forest over the sampled sets: one forest is fit per
# fold of `skf` on the original features, then evaluated on each sampled copy
# of the data; per-fold prediction tables are aggregated into `results` and
# finally concatenated, pickled (last fold's forest) and written to CSV.
ids = []
for train_index, test_index in skf:
    train_X, train_y = data.iloc[train_index], y.iloc[train_index]

    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 n_jobs=n_processes)
    clf.fit(train_X, train_y)

    # Score the fold's forest on every sampled copy of the data set.
    aux = []
    for path in paths:
        test_data = pd.read_csv(path, index_col=0)
        # Align the sampled set with the original data's index order.
        test_data = test_data.loc[data.index].sort_index()
        test_data, test_y = utils.filter_data(test_data,
                                              feature_filter=feature_filter)
        # NOTE(review): rows are selected with train_index, not test_index,
        # so the forest is scored on (sampled copies of) its own training
        # rows — confirm this is intentional before "fixing" it.
        test_X, test_y = test_data.iloc[train_index], test_y.iloc[train_index]
        aux.append(metrics.predict_table(clf, test_X, test_y))

    results.append(metrics.aggregate_predictions(aux))
    ids.extend(test_X.index.tolist())

result = pd.concat(results)

# Persist the classifier from the last fold; the context manager guarantees
# the file handle is closed even if pickling raises (the original leaked it).
with open(result_path + 'Arboles/Arbol_' + percentage + '.pkl',
          'wb+') as output:
    pickle.dump(clf, output)

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')