예제 #1
0
    # feature_labels=data_labels,
    undiscretized_features=[
        'School name', 'Gender', 'Age', 'Locality', 'Family size',
        'Parents marital status', 'Mothers education', 'Fathers education',
        'Mothers job', 'Fathers job', 'Reason to join school',
        'Legal guardian', 'Additional school support',
        'Additional family support', 'Extra tutoring',
        'Extracurricular activities', 'Attended nursery',
        'Planning higher education', 'Access to internet', 'Romantic status',
        'Quality of family relationships', 'Free time post school',
        'Leisure time with friends', 'Daily Alcohol consumption',
        'Weekly alcohol consumption', 'Current health', '# of absences'
    ])

print "rules:\n", clf
print "accuracy:", clf.score(Xtest, ytest)
print "Random Forest accuracy:", sklearn.ensemble.RandomForestClassifier().fit(
    Xtrain, ytrain).score(Xtest, ytest)

# data_class1_labels = ["Iris Versicolour", "No Diabetes"]
# for i in range(len(datasets)):
#     print "--------"
#     print "DATASET: ", datasets[i], "(", dataseturls[i], ")"
#     data = fetch_mldata(datasets[i])
#     y = data.target
#     y[y > 1] = 0
#     y[y < 0] = 0
#
#     Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)
#
#     clf = RuleListClassifier(max_iter=50000, n_chains=3, class1label=data_class1_labels[i], verbose=False)
예제 #2
0
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from RuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier

feature_labels = [
    "#Pregnant", "Glucose concentration test", "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)", "2-Hour serum insulin (mu U/ml)",
    "Body mass index", "Diabetes pedigree function", "Age (years)"
]

data = fetch_mldata("diabetes")  # get dataset
y = -(
    data.target - 1
) / 2  # target labels (0: healthy, or 1: diabetes) - the original dataset contains -1 for diabetes and +1 for healthy

###############################################################################

Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)

print "RuleListClassifier Accuracy:", clf.score(
    Xtest, ytest), "Learned interpretable model:\n", clf

###############################################################################

print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(
    Xtrain, ytrain).score(Xtest, ytest)
예제 #3
0
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from RuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier

feature_labels = ["#Pregnant","Glucose concentration test","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
    
data = fetch_mldata("diabetes") # get dataset
y = (data.target+1)/2 # target labels (0 or 1)
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y) # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)

print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
예제 #4
0
from RuleListClassifier import *
import sklearn.ensemble
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata

dataseturls = ["https://archive.ics.uci.edu/ml/datasets/Iris", "https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes"]
datasets = ["iris", "diabetes"]
data_feature_labels = [
    ["Sepal length", "Sepal width", "Petal length", "Petal width"],
    ["#Pregnant","Glucose concentration demo","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
]
data_class1_labels = ["Iris Versicolour", "No Diabetes"]
for i in range(len(datasets)):
    print "--------"
    print "DATASET: ", datasets[i], "(", dataseturls[i], ")"
    data = fetch_mldata(datasets[i])
    y = data.target
    y[y>1] = 0
    y[y<0] = 0

    Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)    
    
    clf = RuleListClassifier(max_iter=50000, n_chains=3, class1label=data_class1_labels[i], verbose=False)
    clf.fit(Xtrain, ytrain, feature_labels=data_feature_labels[i])
    
    print "accuracy:", clf.score(Xtest, ytest)
    print "rules:\n", clf
    print "Random Forest accuracy:", sklearn.ensemble.RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)