Example #1
    def generate_classifier(sensors):
        #create a source for the setting sensor=value, fill it with random observations
        def create_source(sensor, value, num_bins):
            random_observations = lambda: pandas.Series(
                numpy.random.randint(0, 100, len(targets)), index=targets)
            temporal = pandas.concat(
                [random_observations() for b in range(num_bins)], axis=1)
            total = random_observations()
            return Source(sensor, value, total, temporal)

        #initialize the classifier
        all_settings = [(sensor, value) for sensor in sensors.keys()
                        for value in sensors[sensor]]
        features = sorted(all_settings) + [
            "%s_timedelta" % sensor for sensor in sorted(sensors.keys())
        ]
        targets = [
            "%s=%s" % (sensor, value) for sensor, value in sorted(all_settings)
        ]
        cls = TemporalEvidencesClassifier(features, targets)

        #create a random source for each possible setting
        cls.sources = {(sensor, value): create_source(sensor, value,
                                                      len(cls.bins))
                       for sensor, value in all_settings}
        cls.max_total = max(source.total_counts.sum()
                            for source in cls.sources.values())
        cls.max_temporal = max(source.max_temporal()
                               for source in cls.sources.values())

        return cls
def test_train():
    """
    Test that the classifier correctly extracts all observations from the test dataset.
    """
    #train the classifier
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)

    #load expected sources and their observations from json file
    expected_sources = sources_from_json(sources_file)

    #compare expected with actual sources
    assert_array_equal(sorted(cls.sources.keys()), sorted(expected_sources.keys()))
    for name in expected_sources.keys():
        assert_source_equal(cls.sources[name], expected_sources[name])
def test_recommend():
    """
    Test that the classifier generates the correct recommendations for the test dataset.
    """

    #train the classifier and calculate recommendations
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)
    actual_recommendations = cls.predict(data.data, include_conflict_theta=True)

    #load expected results from json file
    with open(recommendations_file, 'r') as infile:
        expected_recommendations = json.load(infile)

    #compare expected with actual results
    for actual, expected in zip(actual_recommendations, expected_recommendations):
        assert_recommendations_equal(actual, expected)
def initialize_experiment():
    experiment = Experiment(data)
    experiment.add_classifier(TemporalEvidencesClassifier(
        data.features, data.target_names),
                              name="Our method")
    experiment.add_classifier(NaiveBayesClassifier(data.features,
                                                   data.target_names),
                              name="Naive Bayes")
    return experiment
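
For context, a minimal sketch of how this helper might be used, mirroring the cross-validation and reporting calls that appear in the later examples (the fold count and cutoff are assumptions):

#hedged usage sketch for initialize_experiment(), not part of the original excerpt
experiment = initialize_experiment()
results = experiment.run(folds=10)
results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])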
Example #5
def test_train():
    """
    Test that the classifier correctly extracts all observations from the test dataset.
    """
    #train the classifier
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)

    #load expected sources and their observations from json file
    expected_sources = sources_from_json(sources_file)

    #compare expected with actual sources
    assert_array_equal(
        sorted(cls.sources.keys()),
        sorted(expected_sources.keys()),
    )
    for name in expected_sources.keys():
        assert_source_equal(cls.sources[name], expected_sources[name])
Example #6
def test_recommend():
    """
    Test that the classifier generates the correct recommendations for the test dataset.
    """

    #train the classifier and calculate recommendations
    data = load_dataset(data_file)
    cls = TemporalEvidencesClassifier(data.features, data.target_names)
    cls = cls.fit(data.data, data.target)
    actual_recommendations = cls.predict(data.data,
                                         include_conflict_theta=True)

    #load expected results from json file
    with open(recommendations_file, 'r') as infile:
        expected_recommendations = json.load(infile)

    #compare expected with actual results
    for actual, expected in zip(actual_recommendations,
                                expected_recommendations):
        assert_recommendations_equal(actual, expected)
Example #7
    def generate_classifier(sensors):
        #create a source for the setting sensor=value, fill it with random observations
        def create_source(sensor, value, num_bins):
            random_observations = lambda: pandas.Series(numpy.random.randint(0, 100, len(targets)), index=targets)
            temporal = pandas.concat([random_observations() for b in range(num_bins)], axis=1)
            total = random_observations()
            return Source(sensor, value, total, temporal)

        #initialize the classifier
        all_settings = [(sensor, value) for sensor in sensors.keys() for value in sensors[sensor]]
        features = sorted(all_settings) + ["%s_timedelta" % sensor for sensor in sorted(sensors.keys())]
        targets = ["%s=%s" % (sensor, value) for sensor, value in sorted(all_settings)]
        cls = TemporalEvidencesClassifier(features, targets)

        #create a random source for each possible setting
        cls.sources = {(sensor, value): create_source(sensor, value, len(cls.bins))
                       for sensor, value in all_settings}
        cls.max_total = max(source.total_counts.sum() for source in cls.sources.values())
        cls.max_temporal = max(source.max_temporal() for source in cls.sources.values())

        return cls
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.dataset import load_dataset
from evaluation.metrics import results_as_dataframe
from evaluation import plot
import config


#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")

#run the classifier on the whole dataset
cls = TemporalEvidencesClassifier(data.features, data.target_names)
cls = cls.fit(data.data, data.target)
results = cls.predict(data.data, include_conflict_theta=True)

#extract conflict and uncertainty and convert recommendations to pandas representation
recommendations, conflict, uncertainty = zip(*results)
results = results_as_dataframe(data.target, list(recommendations))

#for each row, mark correct recommendations with "1", false recommendations with "0"
find_matches_in_row = lambda row: [1 if col == row.name else 0 for col in row]
results = results.apply(find_matches_in_row, axis=1)

#set uncertainty and conflict as multi-index
results.index = pandas.MultiIndex.from_tuples(zip(conflict, uncertainty),
                                              names=["Conflict", "Uncertainty"])
def houseB():
    """
    This dataset is partially dominated by one of the sensors, which makes the evaluation results less statistically
    sound, e.g. it leads to large confidence intervals when running 10-fold cross-validation.  
    """
    data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config")
    cutoff_results_at = 15
    return data, cutoff_results_at
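
The houseA() helper called below is not included in this excerpt; by analogy with houseB() it would presumably look roughly like this (the cutoff value is an assumption):

def houseA():
    """
    Hypothetical analog of houseB() for the houseA dataset; not part of the original excerpt.
    """
    data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
    cutoff_results_at = 15  #assumed cutoff, the original value for houseA is not shown
    return data, cutoff_results_at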


#configuration
data, cutoff_results_at = houseA()

#run several classifiers on the same dataset, use 10-fold cross-validation
experiment = Experiment(data)
experiment.add_classifier(TemporalEvidencesClassifier(data.features,
                                                      data.target_names),
                          name="Our method")
experiment.add_classifier(NaiveBayesClassifier(data.features,
                                               data.target_names),
                          name="Naive Bayes")
experiment.add_classifier(RandomClassifier(data.features, data.target_names),
                          name="Random")
results = experiment.run(folds=10)

#print and plot results
results.print_quality_comparison_at_cutoff(
    cutoff=1, metrics=["Recall", "Precision", "F1"])
results.print_runtime_comparison()
plot_conf = plot.plot_config(config.plot_directory,
                             sub_dirs=[data.name],
                             img_type=config.img_type)
Example #10
import sys
sys.path.append("..")

import pandas

from evaluation.experiment import Experiment
from evaluation.metrics import quality_metrics
from recsys.classifiers.temporal import TemporalEvidencesClassifier, configure_dynamic_cutoff
from recsys.dataset import load_dataset


#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
methods_to_test = [("Fixed cutoff", None),
                   ("dynamic cutoff=4", configure_dynamic_cutoff(1.0, 0.4, 4)),
                   ("dynamic cutoff=2", configure_dynamic_cutoff(1.0, 0.4, 2))]

#run all configured cutoffs with 10-fold cross-validation
experiment = Experiment(data)
for name, method in methods_to_test:
    experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names,
                              postprocess=method), name=name)
results = experiment.run(folds=10)

#print results
pandas.set_option('expand_frame_repr', False)
pandas.set_option('max_columns', 4)
print "Maximum 5 recommendations"
results.print_quality_comparison_at_cutoff(cutoff=5, metrics=quality_metrics)
print "Maximum 10 recommendations"
results.print_quality_comparison_at_cutoff(cutoff=10, metrics=quality_metrics)
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.binning import initialize_bins
from recsys.dataset import load_dataset
from evaluation import plot
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")

#fit classifier to dataset
cls = TemporalEvidencesClassifier(data.features,
                                  data.target_names,
                                  bins=initialize_bins(0, 300, 10))
cls = cls.fit(data.data, data.target)

#create visualizations of habits around each user action
plot_conf = plot.plot_config(config.plot_directory,
                             sub_dirs=[data.name, "habits"],
                             img_type=config.img_type)
for source in cls.sources.values():
    observations = pandas.DataFrame(source.temporal_counts)
    observations.columns = data.target_names
    observations.index = cls.bins
    plot.plot_observations(source.name(), observations, plot_conf)

print "Results can be found in the \"%s\" directory" % config.plot_directory
Example #12
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.bayes import NaiveBayesClassifier
from recsys.dataset import load_dataset
from evaluation import plot
from evaluation.metrics import QualityMetricsCalculator
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
classifiers = [
    NaiveBayesClassifier(data.features, data.target_names),
    TemporalEvidencesClassifier(data.features, data.target_names)
]

#run the experiment using full dataset as training and as test data
results = []
for cls in classifiers:
    cls = cls.fit(data.data, data.target)
    r = cls.predict(data.data)
    r = QualityMetricsCalculator(data.target, r)
    results.append(r.true_positives_for_all())

#for each classifier, keep only the measurements for cutoff=1
results = [r.loc[1] for r in results]
results = pandas.concat(results, axis=1)
results.columns = [cls.name for cls in classifiers]
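
This excerpt stops before producing any output; a plausible continuation, reusing the plot_config/comparison_histogram pattern from the other examples (the prefix name is hypothetical), would be:

#hedged continuation sketch, not part of the original excerpt
conf = plot.plot_config(config.plot_directory,
                        sub_dirs=[data.name],
                        prefix="histogram_true_positives",  #hypothetical prefix
                        img_type=config.img_type)
plot.comparison_histogram(results, conf)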
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.dataset import load_dataset
from evaluation.metrics import QualityMetricsCalculator


# configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
# data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config")

# run the classifier on the whole dataset and calculate confusion matrix
cls = TemporalEvidencesClassifier(data.features, data.target_names)
cls = cls.fit(data.data, data.target)
results = cls.predict(data.data)
matrix = QualityMetricsCalculator(data.target, results).confusion_matrix()

# format confusion matrix for pretty printing
letters = list(map(chr, list(range(97, 123)))) + list(map(chr, list(range(65, 91))))
action_to_letter = {action: letter for action, letter in zip(matrix.index, letters)}
matrix.columns = [action_to_letter[action] for action in matrix.columns]
matrix.index = ["(%s) %s" % (action_to_letter[action], action) for action in matrix.index]
matrix.index.name = "Actual action"

pandas.set_option("expand_frame_repr", False)
pandas.set_option("max_columns", 40)
print matrix
Example #14
import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.bayes import NaiveBayesClassifier
from recsys.dataset import load_dataset
from evaluation import plot
from evaluation.metrics import QualityMetricsCalculator
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
to_compare = [1, 2, 3, 4]

#run classifier and count true positives
cls = TemporalEvidencesClassifier(data.features, data.target_names)
cls = cls.fit(data.data, data.target)
results = cls.predict(data.data)
results = QualityMetricsCalculator(data.target,
                                   results).true_positives_for_all()

#only use the interesting cutoffs
results = results.transpose()[to_compare]
results.columns = ["cutoff=%s" % c for c in results.columns]

conf = plot.plot_config(config.plot_directory,
                        sub_dirs=[data.name],
                        prefix="histogram_cutoffs",
                        img_type=config.img_type)
plot.comparison_histogram(results, conf)
print "Results can be found in the \"%s\" directory" % config.plot_directory
of the figure still stands: the user has some observable habits after closing the frontdoor.
"""

import sys
sys.path.append("..") 

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.classifiers.binning import initialize_bins
from recsys.dataset import load_dataset
from evaluation import plot
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")

#fit classifier to dataset
cls = TemporalEvidencesClassifier(data.features, data.target_names, bins=initialize_bins(0, 300, 10))
cls = cls.fit(data.data, data.target)

#create visualizations of habits around each user action
plot_conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name, "habits"], img_type=config.img_type)
for source in cls.sources.values():
    observations = pandas.DataFrame(source.temporal_counts)
    observations.columns = data.target_names
    observations.index = cls.bins
    plot.plot_observations(source.name(), observations, plot_conf)
    
print "Results can be found in the \"%s\" directory" % config.plot_directory
Example #16
import sys
sys.path.append("..")

import pandas

from recsys.classifiers.temporal import TemporalEvidencesClassifier
from recsys.dataset import load_dataset
from evaluation.metrics import results_as_dataframe
from evaluation import plot
import config

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")

#run the classifier on the whole dataset
cls = TemporalEvidencesClassifier(data.features, data.target_names)
cls = cls.fit(data.data, data.target)
results = cls.predict(data.data, include_conflict_theta=True)

#extract conflict and uncertainty and convert recommendations to pandas representation
recommendations, conflict, uncertainty = zip(*results)
results = results_as_dataframe(data.target, list(recommendations))

#for each row, mark correct recommendations with "1", false recommendations with "0"
find_matches_in_row = lambda row: [1 if col == row.name else 0 for col in row]
results = results.apply(find_matches_in_row, axis=1)

#set uncertainty and conflict as multi-index
results.index = pandas.MultiIndex.from_tuples(
    zip(conflict, uncertainty), names=["Conflict", "Uncertainty"])