def generate_classifier(sensors):
    #create a source for the setting sensor=value, fill it with random observations
    def create_source(sensor, value, num_bins):
        random_observations = lambda: pandas.Series(numpy.random.randint(0, 100, len(targets)),
                                                    index=targets)
        temporal = pandas.concat([random_observations() for b in range(num_bins)], axis=1)
        total = random_observations()
        return Source(sensor, value, total, temporal)

    #initialize the classifier
    all_settings = [(sensor, value) for sensor in sensors.keys()
                    for value in sensors[sensor]]
    features = sorted(all_settings) + ["%s_timedelta" % sensor
                                       for sensor in sorted(sensors.keys())]
    targets = ["%s=%s" % (sensor, value) for sensor, value in sorted(all_settings)]
    cls = TemporalEvidencesClassifier(features, targets)

    #create a random source for each possible setting
    cls.sources = {(sensor, value): create_source(sensor, value, len(cls.bins))
                   for sensor, value in all_settings}
    cls.max_total = max(source.total_counts.sum() for source in cls.sources.values())
    cls.max_temporal = max(source.max_temporal() for source in cls.sources.values())
    return cls
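# Usage sketch (the sensors dict below is a made-up example, not one of the test
# fixtures): generate_classifier yields a classifier whose sources are filled with
# random observations, handy for tests and benchmarks that need no real dataset.
example_sensors = {"frontdoor": ["open", "closed"],
                   "kitchen_light": ["on", "off"]}
random_cls = generate_classifier(example_sensors)
#every (sensor, value) setting received its own randomized Source
assert sorted(random_cls.sources.keys()) == sorted([("frontdoor", "open"),
                                                    ("frontdoor", "closed"),
                                                    ("kitchen_light", "on"),
                                                    ("kitchen_light", "off")])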
def test_train():
    """ Test that the classifier correctly extracts all observations from the test dataset. """
    #train the classifier
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)

    #load expected sources and their observations from json file
    expected_sources = sources_from_json(sources_file)

    #compare expected with actual sources
    assert_array_equal(sorted(cls.sources.keys()), sorted(expected_sources.keys()))
    for name in expected_sources.keys():
        assert_source_equal(cls.sources[name], expected_sources[name])
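# For orientation, a minimal sketch of what a source comparison could look like;
# the real assert_source_equal lives elsewhere in the test suite, and this
# hypothetical version only assumes that a Source exposes name(), total_counts
# and temporal_counts, as the other scripts in this repository do.
def assert_source_equal_sketch(actual, expected):
    assert actual.name() == expected.name()
    assert_array_equal(actual.total_counts, expected.total_counts)
    assert_array_equal(actual.temporal_counts, expected.temporal_counts)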
def test_recommend():
    """ Test that the classifier generates the correct recommendations for the test dataset. """
    #train the classifier and calculate recommendations
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)
    actual_recommendations = cls.predict(data.data, include_conflict_theta=True)

    #load expected results from json file
    with open(recommendations_file, 'r') as infile:
        expected_recommendations = json.load(infile)

    #compare expected with actual results
    for actual, expected in zip(actual_recommendations, expected_recommendations):
        assert_recommendations_equal(actual, expected)
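# Hedged maintenance sketch: should the expected results ever need to be
# regenerated, the fixture can be rebuilt from a run that is known to be good.
# Depending on the numpy/pandas types inside the predictions, the values may
# first need converting to plain Python types before json.dump accepts them.
def regenerate_recommendations_fixture():
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)
    recommendations = cls.predict(data.data, include_conflict_theta=True)
    with open(recommendations_file, 'w') as outfile:
        json.dump(recommendations, outfile)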
def initialize_experiment():
    experiment = Experiment(data)
    experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names),
                              name="Our method")
    experiment.add_classifier(NaiveBayesClassifier(data.features, data.target_names),
                              name="Naive Bayes")
    return experiment
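# Usage sketch, reusing the Experiment API exactly as it appears in the other
# scripts here: cross-validate both classifiers and print quality at cutoff 1.
experiment = initialize_experiment()
results = experiment.run(folds=10)
results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])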
sys.path.append("..") import pandas from recsys.classifiers.temporal import TemporalEvidencesClassifier from recsys.dataset import load_dataset from evaluation.metrics import results_as_dataframe from evaluation import plot import config #configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") #run the classifier on the whole dataset cls = TemporalEvidencesClassifier(data.features, data.target_names) cls = cls.fit(data.data, data.target) results = cls.predict(data.data, include_conflict_theta=True) #extract conflict and uncertainty and convert recommendations to pandas representation recommendations, conflict, uncertainty = zip(*results) results = results_as_dataframe(data.target, list(recommendations)) #for each row, mark correct recommendations with "1", false recommendations with "0" find_matches_in_row = lambda row: [1 if col == row.name else 0 for col in row] results = results.apply(find_matches_in_row, axis=1) #set uncertainty and conflict as multi-index results.index = pandas.MultiIndex.from_tuples(zip(conflict, uncertainty), names=["Conflict", "Uncertainty"])
def houseB():
    """
    This dataset is partially dominated by one of the sensors, which makes the
    evaluation results less statistically sound, e.g. it leads to large
    confidence intervals when running 10-fold cross-validation.
    """
    data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config")
    cutoff_results_at = 15
    return data, cutoff_results_at

#configuration
data, cutoff_results_at = houseA()

#run several classifiers on the same dataset, use 10-fold cross-validation
experiment = Experiment(data)
experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names),
                          name="Our method")
experiment.add_classifier(NaiveBayesClassifier(data.features, data.target_names),
                          name="Naive Bayes")
experiment.add_classifier(RandomClassifier(data.features, data.target_names),
                          name="Random")
results = experiment.run(folds=10)

#print and plot results
results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])
results.print_runtime_comparison()
plot_conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name],
                             img_type=config.img_type)
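# cutoff_results_at (15 for houseB) bounds how many recommendations are worth
# comparing for a given dataset; the same printing API can be reused at that depth.
results.print_quality_comparison_at_cutoff(cutoff=cutoff_results_at,
                                           metrics=["Recall", "Precision", "F1"])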
sys.path.append("..") import pandas from evaluation.experiment import Experiment from evaluation.metrics import quality_metrics from recsys.classifiers.temporal import TemporalEvidencesClassifier, configure_dynamic_cutoff from recsys.dataset import load_dataset #configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") methods_to_test = [("Fixed cutoff", None), ("dynamic cutoff=4", configure_dynamic_cutoff(1.0, 0.4, 4)), ("dynamic cutoff=2", configure_dynamic_cutoff(1.0, 0.4, 2))] #run all configured cutoffs with 10-fold cross-validation experiment = Experiment(data) for name, method in methods_to_test: experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names, postprocess=method), name=name) results = experiment.run(folds=10) #print results pandas.set_option('expand_frame_repr', False) pandas.set_option('max_columns', 4) print "Maximum 5 recommendations" results.print_quality_comparison_at_cutoff(cutoff=5, metrics=quality_metrics) print "Maximum 10 recommendations" results.print_quality_comparison_at_cutoff(cutoff=10, metrics=quality_metrics)
sys.path.append("..") import pandas from recsys.classifiers.temporal import TemporalEvidencesClassifier from recsys.classifiers.binning import initialize_bins from recsys.dataset import load_dataset from evaluation import plot import config #configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") #fit classifier to dataset cls = TemporalEvidencesClassifier(data.features, data.target_names, bins=initialize_bins(0, 300, 10)) cls = cls.fit(data.data, data.target) #create visualizations of habits around each user action plot_conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name, "habits"], img_type=config.img_type) for source in cls.sources.values(): observations = pandas.DataFrame(source.temporal_counts) observations.columns = data.target_names observations.index = cls.bins plot.plot_observations(source.name(), observations, plot_conf) print "Results can be found in the \"%s\" directory" % config.plot_directory
sys.path.append("..") import pandas from recsys.classifiers.temporal import TemporalEvidencesClassifier from recsys.classifiers.bayes import NaiveBayesClassifier from recsys.dataset import load_dataset from evaluation import plot from evaluation.metrics import QualityMetricsCalculator import config #configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") classifiers = [ NaiveBayesClassifier(data.features, data.target_names), TemporalEvidencesClassifier(data.features, data.target_names) ] #run the experiment using full dataset as training and as test data results = [] for cls in classifiers: cls = cls.fit(data.data, data.target) r = cls.predict(data.data) r = QualityMetricsCalculator(data.target, r) results.append(r.true_positives_for_all()) #want for each classifier result only the measurements for cutoff=1 results = [r.loc[1] for r in results] results = pandas.concat(results, axis=1) results.columns = [cls.name for cls in classifiers]
sys.path.append("..") import pandas from recsys.classifiers.temporal import TemporalEvidencesClassifier from recsys.dataset import load_dataset from evaluation.metrics import QualityMetricsCalculator # configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") # data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config") # run the classifier on the whole dataset and calculate confusion matrix cls = TemporalEvidencesClassifier(data.features, data.target_names) cls = cls.fit(data.data, data.target) results = cls.predict(data.data) matrix = QualityMetricsCalculator(data.target, results).confusion_matrix() # format confusion matrix for pretty printing letters = list(map(chr, list(range(97, 123)))) + list(map(chr, list(range(65, 91)))) action_to_letter = {action: letter for action, letter in zip(matrix.index, letters)} matrix.columns = [action_to_letter[action] for action in matrix.columns] matrix.index = ["(%s) %s" % (action_to_letter[action], action) for action in matrix.index] matrix.index.name = "Actual action" pandas.set_option("expand_frame_repr", False) pandas.set_option("max_columns", 40) print matrix
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.bayes import NaiveBayesClassifier
from recsys.dataset import load_dataset
from evaluation import plot
from evaluation.metrics import QualityMetricsCalculator
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
to_compare = [1, 2, 3, 4]

#run classifier and count true positives
cls = TemporalEvidencesClassifier(data.features, data.target_names)
cls = cls.fit(data.data, data.target)
results = cls.predict(data.data)
results = QualityMetricsCalculator(data.target, results).true_positives_for_all()

#only use the interesting cutoffs
results = results.transpose()[to_compare]
results.columns = ["cutoff=%s" % c for c in results.columns]

conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name],
                        prefix="histogram_cutoffs", img_type=config.img_type)
plot.comparison_histogram(results, conf)
print("Results can be found in the \"%s\" directory" % config.plot_directory)
of the figure still stands: the user has some observable habits after closing
the frontdoor.
"""
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.binning import initialize_bins
from recsys.dataset import load_dataset
from evaluation import plot
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")

#fit classifier to dataset
cls = TemporalEvidencesClassifier(data.features, data.target_names,
                                  bins=initialize_bins(0, 300, 10))
cls = cls.fit(data.data, data.target)

#create visualizations of habits around each user action
plot_conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name, "habits"],
                             img_type=config.img_type)
for source in cls.sources.values():
    observations = pandas.DataFrame(source.temporal_counts)
    observations.columns = data.target_names
    observations.index = cls.bins
    plot.plot_observations(source.name(), observations, plot_conf)

print("Results can be found in the \"%s\" directory" % config.plot_directory)
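# To regenerate the plot for a single setting only (e.g. the frontdoor habit the
# docstring above refers to), the same loop body works for one source; the key
# below is a hypothetical (sensor, value) pair and may need adjusting to the
# actual names in cls.sources.
key = ("frontdoor", "Closed")
if key in cls.sources:
    source = cls.sources[key]
    observations = pandas.DataFrame(source.temporal_counts)
    observations.columns = data.target_names
    observations.index = cls.bins
    plot.plot_observations(source.name(), observations, plot_conf)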
sys.path.append("..") import pandas from recsys.classifiers.temporal import TemporalEvidencesClassifier from recsys.dataset import load_dataset from evaluation.metrics import results_as_dataframe from evaluation import plot import config #configuration data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config") #run the classifier on the whole dataset cls = TemporalEvidencesClassifier(data.features, data.target_names) cls = cls.fit(data.data, data.target) results = cls.predict(data.data, include_conflict_theta=True) #extract conflict and uncertainty and convert recommendations to pandas representation recommendations, conflict, uncertainty = zip(*results) results = results_as_dataframe(data.target, list(recommendations)) #for each row, mark correct recommendations with "1", false recommendations with "0" find_matches_in_row = lambda row: [1 if col == row.name else 0 for col in row] results = results.apply(find_matches_in_row, axis=1) #set uncertainty and conflict as multi-index results.index = pandas.MultiIndex.from_tuples( zip(conflict, uncertainty), names=["Conflict", "Uncertainty"])