# Demo script: fit a RuleListClassifier on the "diabetes" dataset and compare
# its held-out accuracy with a default RandomForestClassifier.
try:
    # train_test_split lives here in newer scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the original location.
    from sklearn.cross_validation import train_test_split
# NOTE(review): fetch_mldata is deprecated in newer scikit-learn releases;
# confirm the pinned version before upgrading.
from sklearn.datasets.mldata import fetch_mldata
from RuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier

feature_labels = [
    "#Pregnant",
    "Glucose concentration test",
    "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index",
    "Diabetes pedigree function",
    "Age (years)",
]

data = fetch_mldata("diabetes")  # get dataset

# Target labels (0: healthy, 1: diabetes). The dataset is understood to encode
# diabetes as -1 and healthy as +1, so the sign has to be flipped for class 1
# to match class1label="diabetes" below. Floor division keeps integer targets
# integral on both Python 2 and Python 3.
y = -(data.target - 1) // 2

Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)

# Single-argument print calls produce the same text on Python 2 and Python 3.
print("RuleListClassifier Accuracy: %s Learned interpretable model:\n%s"
      % (clf.score(Xtest, ytest), clf))
print("RandomForestClassifier Accuracy: %s"
      % RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))
# # y[y < 0] = 0 # Xtrain, Xtest, ytrain, ytest = train_test_split(data_array, target_array) # # print Xtrain.shape # print Xtest.shape # # print Xtrain.shape # new_data print len(data_labels_named) clf = RuleListClassifier(max_iter=50000, n_chains=3, class1label='Performs well', listlengthprior=6, listwidthprior=3, verbose=True) clf.fit( Xtrain, ytrain, feature_labels=data_labels_named, # feature_labels=data_labels, undiscretized_features=[ 'School name', 'Gender', 'Age', 'Locality', 'Family size', 'Parents marital status', 'Mothers education', 'Fathers education', 'Mothers job', 'Fathers job', 'Reason to join school', 'Legal guardian', 'Additional school support', 'Additional family support', 'Extra tutoring', 'Extracurricular activities', 'Attended nursery', 'Planning higher education', 'Access to internet', 'Romantic status',
# Select the model inputs and the target column from the loaded table.
feature_data = data[features]
target_data = data[class1label]

# Clamp every target value above 1 down to 1; values <= 1 are left unchanged.
target_data[target_data > 1] = 1
# Alternative encoding kept for reference:
# target_data = target_data.map({'L': 0, 'H': 1})
print(target_data.head(100))

target_data = np.asarray(target_data)
print('Size of dataset: %s' % (target_data.shape,))

# Pause so the output above can be inspected before the slow part starts.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3 renamed raw_input to input
read_line('Press enter to continue...')

Xtrain, Xtest, ytrain, ytest = train_test_split(feature_data, target_data)  # split

# train classifier (allow more iterations for better accuracy; use
# BigDataRuleListClassifier for large datasets)
model = RuleListClassifier(max_iter=10000, class1label='performs well', verbose=False)
svm_model = svm.SVC(kernel='linear')
svm_model_2 = svm.SVC(kernel='rbf')
rf_model = RandomForestClassifier()

print('*' * 80)
# Single-split evaluation kept for reference:
# print "RuleListClassifier Accuracy:", model.score(Xtest, ytest), "Learned interpretable model:\n", model
# print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)

# Cross-validate the rule list model over the full dataset.
num_partitions = 20
scores = cross_val_score(model, feature_data, target_data, cv=num_partitions)
print('BRL accuracy')
print(scores)
print('BRL average accuracy')
# Demo script: fit a RuleListClassifier on the "diabetes" dataset and compare
# its held-out accuracy with a default RandomForestClassifier.
try:
    # train_test_split lives here in newer scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the original location.
    from sklearn.cross_validation import train_test_split
# NOTE(review): fetch_mldata is deprecated in newer scikit-learn releases;
# confirm the pinned version before upgrading.
from sklearn.datasets.mldata import fetch_mldata
from RuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier

feature_labels = [
    "#Pregnant",
    "Glucose concentration test",
    "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index",
    "Diabetes pedigree function",
    "Age (years)",
]

data = fetch_mldata("diabetes")  # get dataset

# target labels (0: healthy, or 1: diabetes) - the original dataset contains
# -1 for diabetes and +1 for healthy. Floor division keeps integer targets
# integral on both Python 2 and Python 3.
y = -(data.target - 1) // 2

###############################################################################

Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)

# Single-argument print calls produce the same text on Python 2 and Python 3.
print("RuleListClassifier Accuracy: %s Learned interpretable model:\n%s"
      % (clf.score(Xtest, ytest), clf))

###############################################################################

print("RandomForestClassifier Accuracy: %s"
      % RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))
# convert to dataframe
hepatitis_df = pd.DataFrame(columns)

# deal with missing values: every non-object column has its gaps filled with
# that column's mean. Series.mean() skips NaN by default, so no explicit
# NaN mask is needed. The builtin `object` is what the old `np.object` alias
# pointed to, and it still works on NumPy releases that dropped the alias.
for c in hepatitis_df.columns:
    if hepatitis_df[c].dtype != object:
        hepatitis_df[c] = hepatitis_df[c].fillna(hepatitis_df[c].mean())

print(hepatitis_df.head())

###############################################################################

Xtrain, Xtest, ytrain, ytest = train_test_split(hepatitis_df, y)  # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="survival", verbose=False)
clf.fit(Xtrain, ytrain)

# Single-argument print call: same text on Python 2 and Python 3.
print("RuleListClassifier Accuracy: %s Learned interpretable model:\n%s"
      % (clf.score(Xtest, ytest), clf))

###############################################################################

# category_encoders is an optional extra. Only a failed import is turned into
# the install hint; any other error raised while importing propagates as-is.
try:
    from category_encoders import HashingEncoder
except ImportError:
    raise ImportError(
        "Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!"
    )
from sklearn import pipeline
# Demo script: for each dataset, fit a RuleListClassifier, print its held-out
# accuracy and learned rules, then print a default random forest's accuracy.
from RuleListClassifier import *
import sklearn.ensemble
try:
    # train_test_split lives here in newer scikit-learn releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the original location.
    from sklearn.cross_validation import train_test_split
# NOTE(review): fetch_mldata is deprecated in newer scikit-learn releases;
# confirm the pinned version before upgrading.
from sklearn.datasets.mldata import fetch_mldata

# The four lists below are parallel: entry k of each describes dataset k.
dataseturls = [
    "https://archive.ics.uci.edu/ml/datasets/Iris",
    "https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes",
]
datasets = ["iris", "diabetes"]
data_feature_labels = [
    ["Sepal length", "Sepal width", "Petal length", "Petal width"],
    [
        "#Pregnant",
        "Glucose concentration demo",
        "Blood pressure(mmHg)",
        "Triceps skin fold thickness(mm)",
        "2-Hour serum insulin (mu U/ml)",
        "Body mass index",
        "Diabetes pedigree function",
        "Age (years)",
    ],
]
data_class1_labels = ["Iris Versicolour", "No Diabetes"]

for name, url, labels, class1_label in zip(
        datasets, dataseturls, data_feature_labels, data_class1_labels):
    print("--------")
    print("DATASET:  %s ( %s )" % (name, url))

    data = fetch_mldata(name)

    # Binarise the target in place: anything above 1 or below 0 becomes
    # class 0, leaving 1 as the positive class. `y` aliases data.target.
    y = data.target
    y[y > 1] = 0
    y[y < 0] = 0

    Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)

    clf = RuleListClassifier(max_iter=50000, n_chains=3,
                             class1label=class1_label, verbose=False)
    clf.fit(Xtrain, ytrain, feature_labels=labels)

    # Single-argument print calls: same text on Python 2 and Python 3.
    print("accuracy: %s" % clf.score(Xtest, ytest))
    print("rules:\n%s" % clf)
    print("Random Forest accuracy: %s"
          % sklearn.ensemble.RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))
] datasets = ["iris", "diabetes"] data_feature_labels = [ ["Sepal length", "Sepal width", "Petal length", "Petal width"], [ "#Pregnant", "Glucose concentration demo", "Blood pressure(mmHg)", "Triceps skin fold thickness(mm)", "2-Hour serum insulin (mu U/ml)", "Body mass index", "Diabetes pedigree function", "Age (years)" ] ] data_class0_labels = ["Iris Versicolour", "Diabetes"] for i in range(len(datasets)): print "--------" print "DATASET: ", datasets[i], "(", dataseturls[i], ")" data = fetch_mldata(datasets[i]) y = data.target y[y > 1] = 0 y[y < 0] = 0 Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y) clf = RuleListClassifier(max_iter=50000, n_chains=3, class0label=data_class0_labels[i], verbose=False) clf.fit(Xtrain, ytrain, feature_labels=data_feature_labels[i]) print "accuracy:", clf.score(Xtest, ytest) print "rules:\n", clf print "Random Forest accuracy:", sklearn.ensemble.RandomForestClassifier( ).fit(Xtrain, ytrain).score(Xtest, ytest)