Beispiel #1
0
    def test_sums_detailed(self, data):
        """
        Run Sums with fixed priors for three iterations and check the trust
        and belief values of the initial state and the first iteration.
        """
        algorithm = Sums(iterator=FixedIterator(3), priors=PriorBelief.FIXED)
        initial, first, _second, _third = algorithm.run_iter(data)

        expected_initial_belief = {
            "x": {"one": 0.5},
            "y": {"nine": 0.5, "eight": 0.5},
            "z": {"seven": 0.5},
        }
        assert initial.belief == expected_initial_belief
        assert initial.trust == {"s1": 0, "s2": 0, "s3": 0}

        expected_first_belief = {
            "x": {"one": 1},
            "y": {"nine": 3 / 5, "eight": 2 / 5},
            "z": {"seven": 4 / 5},
        }
        assert first.trust == {"s1": 1, "s2": 2 / 3, "s3": 1 / 3}
        assert first.belief == expected_first_belief
Beispiel #2
0
    def test_empty_dataset(self):
        """
        Both non-iterative and iterative algorithms should raise
        EmptyDatasetError (with the expected message) when run on a dataset
        containing no claims, via both run() and run_iter().
        """
        data = Dataset([])
        non_it = MajorityVoting()
        it = Sums()
        err_msg = "Cannot run algorithm on empty dataset"
        for alg in [non_it, it]:
            with pytest.raises(EmptyDatasetError) as excinfo:
                alg.run(data)
            # Check the message inside the loop so it is verified for every
            # algorithm, not just the one from the final iteration
            assert str(excinfo.value) == err_msg

        # Test with run_iter also
        with pytest.raises(EmptyDatasetError) as excinfo2:
            _l = list(it.run_iter(data))
        assert str(excinfo2.value) == err_msg
Beispiel #3
0
    def test_progress_bar(self, dataset):
        """
        Check that a progress-bar rectangle of the expected size appears in
        animation frames when show_progress=True, and that no such rectangle
        appears when show_progress=False.
        """
        w = 200
        anim = JsonAnimator(
            renderer=GraphRenderer(width=w, backend=JsonBackend()))
        alg = Sums(iterator=FixedIterator(20))

        def progress_rects(frame):
            # The progress bar is the only rectangle narrower than the canvas
            return [
                ent for ent in frame["entities"]
                if ent["type"] == "rectangle" and ent["width"] != w
            ]

        buf = StringIO()
        anim.animate(buf, alg, dataset, show_progress=True)
        buf.seek(0)
        obj = json.load(buf)
        # Get the frame for the 5th iteration, which is 1 / 4 through
        rects = progress_rects(obj["frames"][5])
        assert len(rects) == 1
        assert rects[0]["x"] == 0
        assert rects[0]["width"] == w / 4

        # Test without progress
        buf2 = StringIO()
        anim.animate(buf2, alg, dataset, show_progress=False)
        buf2.seek(0)
        obj2 = json.load(buf2)
        assert not progress_rects(obj2["frames"][5])
    def test_num_iterations(self):
        """
        The iterations field of a Result should be None for non-iterative
        algorithms, and the number of iterations performed otherwise.
        """
        data = Dataset([("source 1", "x", 7), ("source 2", "x", 8)])
        assert MajorityVoting().run(data).iterations is None
        assert Sums(iterator=FixedIterator(13)).run(data).iterations == 13
 def test_time_taken(self):
     """
     Test run time in Result objects for iterative and non-iterative
     algorithms
     """
     # NOTE(review): time_taken == 5 only makes sense if the clock is patched
     # elsewhere (e.g. a mocked-time fixture not visible in this chunk) —
     # confirm, since a real wall-clock measurement would never be exactly 5
     data = Dataset([("source 1", "x", 7), ("source 2", "x", 8)])
     res = MajorityVoting().run(data)
     assert res.time_taken == 5
     res = Sums().run(data)
     assert res.time_taken == 5
Beispiel #6
0
    def test_basic(self, data):
        """
        Perform Sums on a small graph. The expected results were obtained by
        finding eigenvectors of suitable matrices (using numpy "by hand"), as
        per Kleinberg paper for Hubs and Authorities
        """
        iterator = ConvergenceIterator(DistanceMeasures.L1, 0.00001)
        results = Sums(iterator=iterator).run(data)

        expected_trust = {"s1": 1, "s2": 0.53208889, "s3": 0.34729636}
        for source, value in expected_trust.items():
            assert np.isclose(results.trust[source], value)

        belief = results.belief
        assert set(belief["x"].keys()) == {"one"}
        assert np.isclose(belief["x"]["one"], 1)

        assert set(belief["y"].keys()) == {"eight", "nine"}
        assert np.isclose(belief["y"]["nine"], 0.65270364)
        assert np.isclose(belief["y"]["eight"], 0.34729636)

        assert set(belief["z"].keys()) == {"seven"}
        assert np.isclose(belief["z"]["seven"], 0.87938524)
Beispiel #7
0
 def test_belief_stats(self, csv_dataset, csv_fileobj, capsys):
     """
     Running the CLI with '-o belief_stats' should output only the belief
     statistics, and they should match those computed directly.
     """
     self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "belief_stats")
     output = yaml.safe_load(capsys.readouterr().out)["sums"]
     assert set(output.keys()) == {"belief_stats"}

     dataset = MatrixDataset.from_csv(csv_fileobj)
     exp_belief_stats = Sums().run(dataset).get_belief_stats()
     expected = {}
     for var, (mean, stddev) in exp_belief_stats.items():
         expected[var] = {"mean": mean, "stddev": stddev}
     assert output["belief_stats"] == expected
Beispiel #8
0
    def test_get_output_obj(self, csv_fileobj):
        """
        Check which fields BaseClient.get_output_obj produces: every field
        except accuracy by default, every field when supervised data is
        given, and only the requested fields when output_fields is passed.
        """
        dataset = MatrixDataset.from_csv(csv_fileobj)
        alg = Sums(iterator=FixedIterator(5))
        # Default should be all fields if none are given, but not accuracy
        # unless supervised data given
        results = alg.run(dataset)
        out1 = BaseClient().get_output_obj(results)
        exp_keys = set()
        for field in OutputFields:
            if field != OutputFields.ACCURACY:
                exp_keys.add(field.value)
        assert set(out1.keys()) == exp_keys

        sup_data = SupervisedData.from_csv(csv_fileobj)
        sup_results = alg.run(sup_data.data)
        out2 = BaseClient().get_output_obj(sup_results, sup_data=sup_data)
        assert set(out2.keys()) == {f.value for f in OutputFields}
        assert out2["trust"] == sup_results.trust
        assert out2["belief"] == sup_results.belief

        out3 = BaseClient().get_output_obj(
            results, output_fields=[OutputFields.TRUST]
        )
        assert set(out3.keys()) == {"trust"}
Beispiel #9
0
    def test_gif_animation(self, dataset):
        """
        Animate Sums as a GIF and check that the output is a valid GIF with
        the expected width and height.
        """
        w, h = 123, 95
        renderer = GraphRenderer(width=w, node_radius=10, spacing=5)
        animator = GifAnimator(renderer=renderer)
        alg = Sums()
        buf = BytesIO()
        animator.animate(buf, alg, dataset)
        buf.seek(0)
        assert is_valid_gif(buf)

        # Check dimensions are as expected. imageio returns arrays of shape
        # (height, width, channels), so unpack in that order — the original
        # code swapped the names, which happened to cancel out in the assert
        # but was misleading to read
        buf.seek(0)
        img_data = imageio.imread(buf)
        got_h, got_w, _ = img_data.shape
        assert (got_w, got_h) == (w, h)
Beispiel #10
0
    def test_custom_output(self, csv_fileobj, csv_dataset, capsys):
        """
        Check that the '-o' flag restricts CLI output to exactly the
        requested fields, and that the trust statistics match those computed
        directly from the algorithm results.
        """
        def run_and_parse(*fields):
            # Run the CLI with the given output fields and return the parsed
            # YAML output for the sums algorithm
            self.run("run", "-a", "sums", "-f", csv_dataset, "-o", *fields)
            return yaml.safe_load(capsys.readouterr().out)["sums"]

        assert set(run_and_parse("time").keys()) == {"time"}

        two_fields = run_and_parse("time", "iterations")
        assert set(two_fields.keys()) == {"time", "iterations"}

        results = run_and_parse("trust", "trust_stats")
        assert set(results.keys()) == {"trust", "trust_stats"}
        exp_mean, exp_stddev = (Sums().run(
            MatrixDataset.from_csv(csv_fileobj)).get_trust_stats())
        assert results["trust_stats"] == {
            "mean": exp_mean,
            "stddev": exp_stddev
        }
Beispiel #11
0
    def test_json_animation(self, dataset):
        """
        Animate Sums with the JSON backend and check the structure of the
        output: an fps value plus a list of frames, each a dict with width,
        height and entities matching the renderer configuration.
        """
        w, h = 123, 95
        renderer = GraphRenderer(width=w,
                                 node_radius=10,
                                 spacing=5,
                                 backend=JsonBackend())
        animator = JsonAnimator(renderer=renderer, frame_duration=1 / 9)
        buf = StringIO()
        animator.animate(buf, Sums(iterator=FixedIterator(4)), dataset)
        buf.seek(0)
        obj = json.load(buf)

        assert "fps" in obj
        assert obj["fps"] == 9
        assert "frames" in obj
        assert isinstance(obj["frames"], list)
        # 4 iterations plus the initial state
        assert len(obj["frames"]) == 5

        first_frame = obj["frames"][0]
        assert isinstance(first_frame, dict)
        for key in ("width", "height", "entities"):
            assert key in first_frame
        assert first_frame["width"] == w
        assert first_frame["height"] == h
"""
from collections import OrderedDict
import itertools
import json
import sys

import numpy as np
import matplotlib.pyplot as plt

from truthdiscovery.input import SyntheticData
from truthdiscovery.algorithm import (AverageLog, Investment, MajorityVoting,
                                      PooledInvestment, Sums, TruthFinder)

# Mapping from display labels to algorithm instances. Built from a list of
# pairs rather than a dict literal: OrderedDict({...}) only preserves the
# written order on Python >= 3.7, which defeats the point of OrderedDict
ALGORITHMS = OrderedDict([
    ("Voting", MajorityVoting()),
    ("Sums", Sums()),
    ("Average.Log", AverageLog()),
    ("Investment", Investment()),
    ("Pooled Investment", PooledInvestment()),
    ("TruthFinder", TruthFinder()),
])


class Experiment:
    # labels for values of independent variable
    labels = None
    # dict mapping algorithm labels to objects
    algorithms = None
    # number of trials to perform for each value
    reps = 10
    # parameters to pass to synthetic data generation. Value for independent
Beispiel #13
0
    def test_base(self, dataset):
        """
        Calling animate() on a BaseAnimator subclass that does not override
        the required behaviour should raise NotImplementedError.
        """
        class MyAnimator(BaseAnimator):
            supported_backends = (PngBackend, )

        animator = MyAnimator()
        with pytest.raises(NotImplementedError):
            animator.animate(BytesIO(), Sums(), dataset)
Beispiel #14
0
 def test_results_based_valid_png(self, dataset, tmpdir):
     """
     Rendering a dataset with a results-based colour scheme should write a
     valid PNG file to disk.
     """
     results = Sums().run(dataset)
     colours = ResultsGradientColourScheme(results)
     out = tmpdir.join("mygraph.png")
     renderer = GraphRenderer(backend=PngBackend(), colours=colours)
     renderer.render(dataset, out)
     with open(str(out), "rb") as f:
         assert is_valid_png(f)
Beispiel #15
0
 def test_sums(self, data):
     """
     Check Sums results against the stored expected-results file.
     """
     algorithm = Sums(iterator=FixedIterator(20))
     self.check_results(algorithm, data, "sums_results.json")
def main():
    """
    Load a stock dataset — either unpickled from a single cache-file
    argument, or built from raw data and ground-truth paths — print summary
    statistics, then run each algorithm and report its run time and accuracy.
    """
    # Show usage
    if len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help"):
        usage()
        return

    dataset = None
    sup = None

    # Unpickle dataset from a file if only one argument given
    if len(sys.argv) == 2:
        print("unpickling data...")
        start = time.time()
        # SECURITY NOTE(review): pickle.load executes arbitrary code from
        # the file — fine for a locally-produced cache, but never point this
        # at a file from an untrusted source
        with open(sys.argv[1], "rb") as pickle_file:
            sup = pickle.load(pickle_file)
        end = time.time()
        print("  unpickled in {:.3f} seconds".format(end - start))
        dataset = sup.data

    # With two arguments, load raw data and true values from the given paths
    elif len(sys.argv) == 3:
        data_path, truth_path = sys.argv[1:]
        print("loading data...")
        start = time.time()
        dataset = StockDataset(data_path)
        end = time.time()
        print("  loaded in {:.3f} seconds".format(end - start))

        print("loading true values...")
        start = time.time()
        sup = SupervisedStockData(dataset, truth_path)
        end = time.time()
        print("  loaded in {:.3f} seconds".format(end - start))

        # Cache the loaded data so later runs can use the faster
        # single-argument unpickling path above
        pickle_path = "/tmp/stock_data.pickle"
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(sup, pickle_file)
        print("pickled to {}".format(pickle_path))

    else:
        usage(sys.stderr)
        sys.exit(1)

    print("")
    print("dataset has {} sources, {} claims, {} variables".format(
        dataset.num_sources, dataset.num_claims, dataset.num_variables
    ))

    start = time.time()
    print("calculating connected components...")
    components = dataset.num_connected_components()
    end = time.time()
    print("  calculated in {:.3f} seconds: {} components".format(
        end - start, components
    ))

    algorithms = [
        MajorityVoting(), Sums(), AverageLog(), Investment(),
        PooledInvestment(), TruthFinder()
    ]

    # Run every algorithm on the supervised data and report timing/accuracy
    for alg in algorithms:
        print("running {}...".format(alg.__class__.__name__))
        res = alg.run(sup.data)
        acc = sup.get_accuracy(res)
        print("  {:.3f} seconds, {:.3f} accuracy".format(res.time_taken, acc))
Beispiel #17
0
    # Parse command-line arguments: an optional "--plain" flag followed by
    # the output image path
    plain = False
    if len(sys.argv) == 3 and sys.argv[1] == "--plain":
        outpath = sys.argv[2]
        plain = True
    elif len(sys.argv) == 2:
        outpath = sys.argv[1]
    else:
        print("usage: {} [--plain] DEST".format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)

    # Small example dataset of (source, variable, claimed value) triples.
    # The last entry uses deliberately long names, presumably to exercise
    # label layout in the renderer — TODO confirm
    tuples = [
        ("source 1", "x", 4),
        ("source 1", "y", 7),
        ("source 2", "y", 7),
        ("source 2", "z", 5),
        ("source 3", "x", 3),
        ("source 3", "z", 5),
        ("source 4", "x", 3),
        ("source 4", "y", 6),
        ("source 4", "z", 8),
        ("my really long source name", "mylongvar", "extremelylongvalue"),
    ]
    mydata = Dataset(tuples)
    results = Sums().run(mydata)

    # Colour the graph by algorithm results unless plain output was requested
    colour_scheme = (PlainColourScheme()
                     if plain else ResultsGradientColourScheme(results))
    renderer = GraphRenderer(width=1000, colours=colour_scheme)
    with open(outpath, "wb") as imgfile:
        renderer.render(mydata, imgfile)
from truthdiscovery.input import MatrixDataset, SyntheticData
from truthdiscovery.algorithm import (AverageLog, Investment, MajorityVoting,
                                      PooledInvestment, Sums, TruthFinder)

# Sizes for sources/variables to use in the experiments
DATA_SIZES = list(range(100, 2001, 200))
# The fixed size for the parameter that is not being varied
FIXED_SIZE = 500
# Parameters for synthetic data generation
CLAIM_PROBABILITY = 0.1
DOMAIN_SIZE = 4

# Mapping from labels to algorithm instances. Built from a list of pairs
# rather than a dict literal, since OrderedDict({...}) only preserves order
# on Python >= 3.7.
# NOTE(review): the label casing is inconsistent ("voting" vs "TruthFinder");
# keys are kept unchanged here because they are runtime strings that may
# appear in saved output — confirm before normalising
ALGORITHMS = OrderedDict([
    ("voting", MajorityVoting()),
    ("sums", Sums()),
    ("average.log", AverageLog()),
    ("investment", Investment()),
    ("Pooled Investment", PooledInvestment()),
    ("TruthFinder", TruthFinder()),
])


def generate_timings():
    print("generating large trust vector...", file=sys.stderr)
    max_size = max(DATA_SIZES)
    trust = np.random.uniform(size=(max_size, ))

    print("generating large dataset...", file=sys.stderr)
    large_synth = SyntheticData(trust,
                                num_variables=max_size,