def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10, inverse=False, lc_filter=None):
    """Train and evaluate a random forest with stratified k-fold CV.

    path: location of the dataset used for training
    index_filter: pandas index used to select the dataset rows to use
    class_filter: list of classes to keep
    feature_filter: list of features to keep

    Returns a single DataFrame with the out-of-fold prediction tables
    of every fold concatenated.
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        if inverse:
            # Swap the fold roles: train on the small split, test on the rest.
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                     max_depth=14, min_samples_split=5)
        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10, inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):
    """Train and evaluate a decision tree with stratified k-fold CV.

    path: location of the dataset used for training
    index_filter: pandas index used to select the dataset rows to use
    class_filter: list of classes to keep
    feature_filter: list of features to keep

    Returns the concatenated out-of-fold prediction tables.
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        if inverse:
            # Swap the fold roles: train on the small split, test on the rest.
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                     min_samples_split=min_samples_split)
        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
# Split the target labels away from the feature matrix ('class' column).
y = data['class']
data = data.drop('class', axis=1)

results = []
skf = cross_validation.StratifiedKFold(y, n_folds=folds)
for train_index, test_index in skf:
    train_X, test_X = data.iloc[train_index], data.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

    clf = None
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))

# All out-of-fold prediction tables combined into one frame.
result = pd.concat(results)

# NOTE(review): only the classifier of the LAST fold is pickled here —
# confirm that is intended rather than refitting on the full data.
output = open(result_path + 'Arboles/Arbol.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result.csv')

# Persist the (soft/probabilistic) confusion matrix of the CV predictions.
matrix = metrics.confusion_matrix(result)
matrix.to_csv(result_path + 'Metricas/soft_matrix_.csv')

# Per-class F-scores computed from the confusion matrix.
clases = matrix.columns.tolist()
f_score = [metrics.f_score(matrix, c) for c in clases]
def fit_means_rf(train_path, test_path, index_filter=None, class_filter=None, feature_filter=None, folds=10):
    """Cross-validate a random forest trained on one dataset and tested on another.

    train_path: location of the dataset used for training
    test_path: location of the dataset used for testing
    index_filter: pandas index used to select the dataset rows to use
    class_filter: list of classes to keep
    feature_filter: list of features to keep

    Both datasets are restricted to their common (deduplicated) index so the
    same objects are used for training and testing in every fold.
    """
    train_data = pd.read_csv(train_path, index_col=0)
    test_data = pd.read_csv(test_path, index_col=0)

    # Drop light curves whose index appears more than once
    test_data = utils.remove_duplicate_index(test_data)
    train_data = utils.remove_duplicate_index(train_data)

    if index_filter:
        train_data = train_data.loc[index_filter]
        test_data = test_data.loc[index_filter]

    if class_filter:
        train_data = train_data[train_data['class'].apply(lambda x: x in class_filter)]
        test_data = test_data[test_data['class'].apply(lambda x: x in class_filter)]

    train_data = train_data.dropna(axis=0, how='any')
    test_data = test_data.dropna(axis=0, how='any')

    # Keep only the objects present in both datasets, in the same order
    common_index = list(set(test_data.index.tolist()) & set(train_data.index.tolist()))
    test_data = test_data.loc[common_index].sort_index()
    train_data = train_data.loc[common_index].sort_index()

    # Split class labels away from the features
    train_y = train_data['class']
    train_X = train_data.drop('class', axis=1)
    test_y = test_data['class']
    test_X = test_data.drop('class', axis=1)

    if feature_filter:
        train_X = train_X[feature_filter]
        test_X = test_X[feature_filter]

    skf = cross_validation.StratifiedKFold(train_y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        fold_train_X, fold_train_y = train_X.iloc[train_index], train_y.iloc[train_index]
        fold_test_X, fold_test_y = test_X.iloc[test_index], test_y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                     max_depth=14, min_samples_split=5)
        clf.fit(fold_train_X, fold_train_y)
        results.append(metrics.predict_table(clf, fold_test_X, fold_test_y))

    return pd.concat(results)
feature_filter = args.feature_filter

train_data = pd.read_csv(training_set_path, index_col=0)
train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

test_data = pd.read_csv(test_set_path, index_col=0)
test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)

clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)

clf.fit(train_X, train_y)
result = metrics.predict_table(clf, test_X, test_y)

# Index the predictions by the id of each tested object.
# BUG FIX: DataFrame.set_index returns a new frame; the original discarded it
# (and then dropped the 'indice' column), leaving the default integer index
# in the saved predictions. set_index consumes the column, so no drop needed.
result['indice'] = test_X.index.tolist()
result = result.set_index('indice')
result.index.name = catalog + '_id'

# Persist the trained forest alongside the predictions.
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

print(metrics.weighted_f_score(metrics.confusion_matrix(result)))
# Fit one forest on this fold's training rows, then score it against every
# sampled dataset in `paths` and aggregate those per-sample predictions.
train_X, train_y = data.iloc[train_index], y.iloc[train_index]

clf = None
clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)

clf.fit(train_X, train_y)

aux = []
for path in paths:
    test_data = pd.read_csv(path, index_col=0)
    # Align the sampled set with the rows of the training frame.
    test_data = test_data.loc[data.index].sort_index()
    test_data, test_y = utils.filter_data(
        test_data, feature_filter=feature_filter)
    # NOTE(review): the test rows are selected with train_index — presumably
    # the same objects are scored across every sampled curve set; confirm
    # this is intended and not a train/test index mix-up.
    test_X, test_y = test_data.iloc[train_index], test_y.iloc[
        train_index]
    aux.append(metrics.predict_table(clf, test_X, test_y))

# One aggregated prediction table per outer iteration; ids come from the
# last sampled set (all are aligned to the same index above).
results.append(metrics.aggregate_predictions(aux))
ids.extend(test_X.index.tolist())

result = pd.concat(results)

# NOTE(review): only the most recently trained forest is pickled here.
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
for train_index, test_index in skf:
    if inverse:
        # Swap the fold roles: train on the small split, test on the rest.
        train_index, test_index = test_index, train_index

    train_X, test_X = data.iloc[train_index], data.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 n_jobs=n_processes)

    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))
    ids.extend(test_X.index.tolist())

    if validation == 'holdout':
        # Save and report each holdout fold's predictions separately.
        aux = metrics.predict_table(clf, test_X, test_y)
        aux.to_csv(result_path + 'Predicciones/hold_' + str(count) + '.csv')
        print('hold ' + str(count) + ' ' + str(metrics.weighted_f_score(metrics.confusion_matrix(aux))))
        count += 1

result = pd.concat(results)

# BUG FIX: DataFrame.set_index returns a new frame; the original discarded it
# (and then dropped the 'indice' column), so the object ids never became the
# index of the saved predictions. set_index consumes the column, so no drop
# is needed afterwards.
result['indice'] = ids
result = result.set_index('indice')
result.index.name = catalog + '_id'

output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
# Indices of the objects reserved for testing.
test_index_filter = pd.read_csv(test_index_filter, index_col=0).index

# One CSV of features per sampled realization of the light curves.
paths = [
    sets_path + catalog + '_sampled_' + str(i) + '.csv'
    for i in xrange(n_samples)
]

resultados = []
for p in paths:
    data = pd.read_csv(p, index_col=0)

    train_X, train_y = utils.filter_data(data, index_filter=train_index_filter,
                                         feature_filter=feature_filter)
    test_X, test_y = utils.filter_data(data, index_filter=test_index_filter,
                                       feature_filter=feature_filter)

    clf = None
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                 min_samples_split=min_samples_split)

    clf.fit(train_X, train_y)
    resultados.append(metrics.predict_table(clf, test_X, test_y))

# Combine the per-sample prediction tables into a single aggregated result.
result = metrics.aggregate_predictions(resultados)
result.to_csv(result_path + 'result_' + percentage + '.csv')

print metrics.weighted_f_score(metrics.confusion_matrix(result))
for train_index, test_index in skf:
    fold_train_X = train_X.iloc[train_index]
    fold_train_y = train_y.iloc[train_index]
    fold_test_X = test_X.iloc[test_index]
    fold_test_y = test_y.iloc[test_index]

    clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                 max_depth=14, min_samples_split=5, n_jobs=-1)
    clf.fit(fold_train_X, fold_train_y)
    results.append(metrics.predict_table(clf, fold_test_X, fold_test_y))
    ids.extend(fold_test_X.index.tolist())

result = pd.concat(results)

# BUG FIX: DataFrame.set_index returns a new frame; the original discarded it
# (and then dropped the 'indice' column), so the object ids never became the
# index of the saved predictions. set_index consumes the column, so no drop
# is needed afterwards.
result['indice'] = ids
result = result.set_index('indice')
result.index.name = catalog + '_id'

# NOTE(review): only the forest of the last CV fold is persisted here.
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')
train_data = pd.read_csv(training_set_path, index_col=0)
train_X, train_y = utils.filter_data(train_data, feature_filter=feature_filter)

test_data = pd.read_csv(test_set_path, index_col=0)
test_X, test_y = utils.filter_data(test_data, feature_filter=feature_filter)

clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split,
                             n_jobs=n_processes)

clf.fit(train_X, train_y)
result = metrics.predict_table(clf, test_X, test_y)

# Index the predictions by the id of each tested object.
# BUG FIX: DataFrame.set_index returns a new frame; the original discarded it
# (and then dropped the 'indice' column), leaving the default integer index
# in the saved predictions. set_index consumes the column, so no drop needed.
result['indice'] = test_X.index.tolist()
result = result.set_index('indice')
result.index.name = catalog + '_id'

# Persist the trained forest alongside the predictions.
output = open(result_path + 'Arboles/Arbol_' + percentage + '.pkl', 'wb+')
pickle.dump(clf, output)
output.close()

result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')

print(metrics.weighted_f_score(metrics.confusion_matrix(result)))