criterion = args.criterion max_depth = args.max_depth min_samples_split = args.min_samples_split result_path = args.result_path feature_filter = args.feature_filter data = pd.read_csv(training_set_path, index_col=0) paths = [ test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100) ] # Necesito asegurarme de que las curvas sean las mismas en train y test test_data = pd.read_csv(paths[0], index_col=0) data, test_data = utils.equalize_indexes(data, test_data) data, y = utils.filter_data(data, feature_filter=feature_filter) skf = cross_validation.StratifiedKFold(y, n_folds=folds) results = [] ids = [] for train_index, test_index in skf: train_X, train_y = data.iloc[train_index], y.iloc[train_index] clf = None clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
# coding=utf-8

# One-off script kept only as a record of how the testing sets for EROS and
# MACHO were separated. It is not meant to be run as part of the normal
# workflow.
# -----------------------------------------------------------------------------

import pandas as pd
from sklearn import cross_validation

import utils

# Catalog whose sets are processed, and the size handed to the splitter for
# the held-out test partition.
catalog = 'EROS'
test_size = 5000

# Directory prefixes, hoisted so each location is spelled only once. The
# concatenated paths are exactly the ones the script has always used.
features_sets_dir = '/n/home09/ncastro/workspace/Features/sets/'
sampled_sets_dir = '/n/seasfs03/IACS/TSC/ncastro/sets/'
common_dir = features_sets_dir + 'Common/'

normal_df = pd.read_csv(
    features_sets_dir + catalog + '/' + catalog + '_regular_set_5.csv',
    index_col=0)
sample_df = pd.read_csv(
    sampled_sets_dir + catalog + '_Sampled/uniform/5%/' + catalog +
    '_sampled_0.csv',
    index_col=0)

# Only the first frame returned by the helper is used afterwards.
# NOTE(review): presumably equalize_indexes restricts both frames to the rows
# they have in common -- confirm against utils.
common_df, _ = utils.equalize_indexes(normal_df, sample_df)

# Stratify on the 'class' column. random_state is fixed so the split is
# reproducible, and n_iter=1 means the splitter yields exactly one
# (train, test) pair of positional index arrays.
splitter = cross_validation.StratifiedShuffleSplit(
    common_df['class'], n_iter=1, test_size=test_size, random_state=1)

# Take that single split directly instead of looping over the splitter; only
# the test positions are needed.
_, test_index = next(iter(splitter))
test_df = common_df.iloc[test_index]

# NOTE(review): the complete equalized set (not just the training part of the
# split) is what gets written to '<catalog>.csv', exactly as the script has
# always done -- confirm this is intended.
common_df.to_csv(common_dir + catalog + '.csv')
test_df.to_csv(common_dir + catalog + '_test.csv')
n_estimators = args.n_estimators criterion = args.criterion max_depth = args.max_depth min_samples_split = args.min_samples_split feature_filter = args.feature_filter index_filter = args.index_filter if index_filter is not None: index_filter = pd.read_csv(index_filter, index_col=0).index train_data = pd.read_csv(train_path, index_col=0) test_data = pd.read_csv(test_path, index_col=0) train_data, test_data = utils.equalize_indexes(train_data, test_data) train_X, train_y = utils.filter_data(train_data, index_filter=index_filter, feature_filter=feature_filter) test_X, test_y = utils.filter_data(test_data, index_filter=index_filter, feature_filter=feature_filter) # Ocupo solo los datos de test para hacer el k-fold, por que estos no estan repetidos # Y es valido ocuparlos solo por posicion skf = cross_validation.StratifiedKFold(test_y, n_folds=folds) results = [] ids = [] for train_index, test_index in skf: if inverse: aux = train_index train_index = test_index test_index = aux
n_estimators = args.n_estimators criterion = args.criterion max_depth = args.max_depth min_samples_split = args.min_samples_split feature_filter = args.feature_filter index_filter = args.index_filter if index_filter is not None: index_filter = pd.read_csv(index_filter, index_col=0).index train_data = pd.read_csv(train_path, index_col=0) test_data = pd.read_csv(test_path, index_col=0) train_data, test_data = utils.equalize_indexes(train_data, test_data) train_X, train_y = utils.filter_data(train_data, index_filter=index_filter, feature_filter=feature_filter) test_X, test_y = utils.filter_data(test_data, index_filter=index_filter, feature_filter=feature_filter) # Ocupo solo los datos de test para hacer el k-fold, por que estos no estan repetidos # Y es valido ocuparlos solo por posicion skf = cross_validation.StratifiedKFold(test_y, n_folds=folds) results = [] ids = [] for train_index, test_index in skf:
n_estimators = args.n_estimators criterion = args.criterion max_depth = args.max_depth min_samples_split = args.min_samples_split result_path = args.result_path feature_filter = args.feature_filter data = pd.read_csv(training_set_path, index_col=0) paths = [test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)] # Necesito asegurarme de que las curvas sean las mismas en train y test test_data = pd.read_csv(paths[0], index_col=0) data, test_data = utils.equalize_indexes(data, test_data) data, y = utils.filter_data(data, feature_filter=feature_filter) skf = cross_validation.StratifiedKFold(y, n_folds=folds) results = [] ids = [] for train_index, test_index in skf: train_X, train_y = data.iloc[train_index], y.iloc[train_index] clf = None clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, n_jobs=n_processes)