def test_create(self):
    # From a regular numpy array
    normal_arr = np.ones((2, 3))
    MatrixDataset(normal_arr)
    # From masked array
    masked_arr = ma.array(normal_arr,
                          mask=[[True, False, True], [False, True, False]])
    MatrixDataset(masked_arr)
def test_export_to_csv(self):
    data = MatrixDataset(
        ma.masked_values(
            [
                # All full row
                [1, 2, 3, 4, 5, 6, 7, 8],
                # Mixed row
                [1, 2, 0, -123, 4, -2.3, 99.123, -123],
                # All empty row
                [-123, -123, -123, -123, -123, -123, -123, -123]
            ],
            -123))
    expected = "\n".join((
        "1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0",
        "1.0,2.0,0.0,,4.0,-2.3,99.123,",
        ",,,,,,,"
    ))
    assert data.to_csv() == expected
def generate_graph(self, args, parser):
    try:
        renderer = self.get_graph_renderer(args)
        dataset = MatrixDataset.from_csv(args.dataset)
    except ValueError as ex:  # pragma: no cover
        parser.error(ex)
    renderer.render(dataset, args.outfile)
def test_mutual_exclusion_matrix(self):
    data = MatrixDataset(
        ma.masked_values(
            [[7, 4, 7],
             [5, 1, -1],
             [-1, 2, 4],
             [7, -1, 2],
             [-1, 1, 2]],
            -1))
    # Claims are:
    # 0: x=7
    # 1: y=4
    # 2: z=7
    # 3: x=5
    # 4: y=1
    # 5: y=2
    # 6: z=4
    # 7: z=2
    # Mutual exclusion groups are {0, 3}, {1, 4, 5}, {2, 6, 7}
    expected_mut_ex_mat = np.array([
        [1, 0, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 0],
        [0, 0, 1, 0, 0, 0, 1, 1],
        [1, 0, 0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 0],
        [0, 0, 1, 0, 0, 0, 1, 1],
        [0, 0, 1, 0, 0, 0, 1, 1],
    ])
    assert data.mut_ex.shape == expected_mut_ex_mat.shape
    assert np.array_equal(data.mut_ex.toarray(), expected_mut_ex_mat)
def perform_test(master_sv, data_shapes):
    """
    Perform a number of timing tests

    :param master_sv:   source-variables matrix to extract smaller datasets
                        from
    :param data_shapes: an iterable of (num_sources, num_variables) for the
                        sizes of data to test with
    :return: an OrderedDict {alg_label: list_of_timings, ...}
    """
    timings = OrderedDict()
    for num_sources, num_vars in data_shapes:
        print("getting reduced dataset: {} sources, {} variables...".format(
            num_sources, num_vars), file=sys.stderr)
        data = MatrixDataset(master_sv[0:num_sources, 0:num_vars])
        for alg_label, alg in ALGORITHMS.items():
            print(" running {}...".format(alg_label), end="",
                  file=sys.stderr)
            res = alg.run(data)
            print(" {:.3f} seconds".format(res.time_taken), file=sys.stderr)
            if alg_label not in timings:
                timings[alg_label] = []
            timings[alg_label].append(res.time_taken)
    return timings
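# A minimal usage sketch for perform_test. Illustrative only: the matrix
# size, mask density and data shapes below are made-up assumptions, and
# ALGORITHMS is taken to be the mapping used in perform_test above.
if __name__ == "__main__":
    import numpy as np
    import numpy.ma as ma

    rng = np.random.default_rng(0)
    values = rng.integers(0, 5, size=(500, 200)).astype(float)
    mask = rng.random((500, 200)) < 0.2  # hide ~20% of cells
    master_sv = ma.array(values, mask=mask)
    timings = perform_test(master_sv, [(50, 20), (100, 50), (500, 200)])
    for alg_label, times in timings.items():
        print(alg_label, times)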
def run_algorithm(self, args, parser):
    alg_objs = []
    all_params = dict(args.alg_params or [])
    for cls in args.alg_classes:
        params, ignored = self.get_algorithm_params(cls, all_params)
        alg_objs.append(self.get_algorithm_object(cls, params))
        if ignored:
            msg = self.get_ignored_parameters_message(cls, ignored)
            print("WARNING: {}".format(msg), file=sys.stderr)

    sup_data = None
    dataset = None
    if args.supervised:
        sup_data = SupervisedData.from_csv(args.dataset)
        dataset = sup_data.data
    else:
        # Catch error early if accuracy requested in output but dataset is
        # not supervised
        if OutputFields.ACCURACY in args.output_fields:
            parser.error("cannot calculate accuracy without --supervised")
        dataset = MatrixDataset.from_csv(args.dataset)

    output_obj = {}
    for alg in alg_objs:
        results = alg.run(dataset).filter(sources=args.sources,
                                          variables=args.variables)
        # Get results to display
        label = self.ALG_LABEL_MAPPING.inverse[alg.__class__]
        output_obj[label] = self.get_output_obj(
            results, output_fields=args.output_fields, sup_data=sup_data)
    print(yaml.dump(output_obj, indent=2, default_flow_style=False))
def test_from_csv_empty_rows(self, tmpdir):
    filepath = tmpdir.join("data.csv")
    filepath.write("\n".join(["1,2,", ",,", " ,\t,", "3,4,5"]))
    data = MatrixDataset.from_csv(filepath.open())
    expected_matrix = ma.masked_values(
        [[1, 2, 0], [0, 0, 0], [0, 0, 0], [3, 4, 5]], 0)
    assert np.array_equal(data.sv.mask, expected_matrix.mask)
    assert (data.sv == expected_matrix).all()
def test_truthfinder(self, data):
    it = ConvergenceIterator(DistanceMeasures.COSINE, 0.001)
    truthfinder = TruthFinder(iterator=it)

    def imp(var, val1, val2):
        diff = val1 - val2
        return np.exp(-0.5 * diff**2)

    data = MatrixDataset(data.sv, implication_function=imp)
    self.check_results(truthfinder, data, "truthfinder_results.json")
def test_results(self, csv_dataset, csv_fileobj, capsys):
    self.run("run", "-a", "average_log", "-f", csv_dataset)
    got_results = yaml.safe_load(capsys.readouterr().out)
    assert "average_log" in got_results
    alg_results = got_results["average_log"]
    assert isinstance(alg_results, dict)

    exp_results = AverageLog().run(MatrixDataset.from_csv(csv_fileobj))
    assert alg_results["trust"] == exp_results.trust
    assert alg_results["belief"] == exp_results.belief
    assert alg_results["iterations"] == exp_results.iterations
def test_from_csv_single_row_or_column(self, tmpdir):
    filepath1 = tmpdir.join("data1.csv")
    filepath1.write("1,,3,2,6")
    data1 = MatrixDataset.from_csv(filepath1.open())
    exp_sv1 = ma.masked_values([[1, 0, 3, 2, 6]], 0)
    assert data1.num_sources == 1
    assert data1.num_variables == 4
    assert data1.num_claims == 4
    assert np.array_equal(data1.sv.mask, exp_sv1.mask)
    assert (data1.sv == exp_sv1).all()

    filepath2 = tmpdir.join("data2.csv")
    filepath2.write("1\n\n3\n2\n6")
    data2 = MatrixDataset.from_csv(filepath2.open())
    exp_sv2 = exp_sv1.T
    assert data2.num_sources == 4
    assert data2.num_variables == 1
    assert data2.num_claims == 4
    assert np.array_equal(data2.sv.mask, exp_sv2.mask)
    assert (data2.sv == exp_sv2).all()
def test_claims_matrix(self):
    data = MatrixDataset(
        ma.masked_values(
            [[7, 4, 7],
             [5, 1, -1],
             [-1, 2, 4],
             [7, -1, 2],
             [-1, 1, 2]],
            -1))
    expected_claim_mat = np.array([[1, 1, 1, 0, 0, 0, 0, 0],
                                   [0, 0, 0, 1, 1, 0, 0, 0],
                                   [0, 0, 0, 0, 0, 1, 1, 0],
                                   [1, 0, 0, 0, 0, 0, 0, 1],
                                   [0, 0, 0, 0, 1, 0, 0, 1]])
    assert data.sc.shape == expected_claim_mat.shape
    assert np.array_equal(data.sc.toarray(), expected_claim_mat)
def test_belief_stats(self, csv_dataset, csv_fileobj, capsys):
    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "belief_stats")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"belief_stats"}
    exp_belief_stats = (Sums().run(MatrixDataset.from_csv(csv_fileobj))
                        .get_belief_stats())
    assert results["belief_stats"] == {
        var: {"mean": mean, "stddev": stddev}
        for var, (mean, stddev) in exp_belief_stats.items()
    }
def test_trust_invalid(self):
    """
    In theory trust scores cannot be 1 for any source. In practice
    scores get so close to 1 that they are rounded to 1, which causes
    problems when we do log(1 - trust). This test checks that iteration
    stops in this case
    """
    data = MatrixDataset(
        np.array([[1, 2, 3],
                  [1, 2, 3],
                  [1, 2, 3],
                  [1, 2, 3],
                  [1, 2, 3]]))
    it = FixedIterator(100)
    alg = TruthFinder(iterator=it)
    res = alg.run(data)
    # Iteration should stop after only 7 iterations, instead of 100
    assert it.it_count == 7
    assert res.iterations == 7
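# A minimal standalone illustration (not part of the test suite) of the
# numerical issue described in the docstring above: once a trust score is
# rounded to exactly 1, log(1 - trust) is no longer finite, so iteration
# has to be cut short.
import numpy as np

trust = np.float64(1) - np.float64(1e-17)  # rounds to exactly 1.0
assert trust == 1.0
with np.errstate(divide="ignore"):
    print(np.log(1 - trust))  # -inf: the log(1 - trust) update breaks down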
def test_custom_output(self, csv_fileobj, csv_dataset, capsys):
    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "time")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"time"}

    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "time",
             "iterations")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"time", "iterations"}

    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "trust",
             "trust_stats")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"trust", "trust_stats"}
    exp_mean, exp_stddev = (Sums().run(MatrixDataset.from_csv(csv_fileobj))
                            .get_trust_stats())
    assert results["trust_stats"] == {
        "mean": exp_mean,
        "stddev": exp_stddev
    }
def test_from_csv(self, tmpdir):
    filepath = tmpdir.join("data.csv")
    filepath.write("\n".join([
        "1,,3, 2,6 ",  # extra whitespace should not matter
        ", 9,0,2,5",
        "3,9, ,,1",
        "1,9 , 5.7,3,4",
        "5,1,3,1,1",
        "\n"  # new lines at the end of file should not matter
    ]))
    data = MatrixDataset.from_csv(filepath.open())
    expected_matrix = ma.masked_values(
        [[1, 999, 3, 2, 6],
         [999, 9, 0, 2, 5],
         [3, 9, 999, 999, 1],
         [1, 9, 5.7, 3, 4],
         [5, 1, 3, 1, 1]],
        999)
    assert data.num_sources == 5
    assert data.num_variables == 5
    assert data.num_claims == 15
    assert np.array_equal(data.sv.mask, expected_matrix.mask)
    assert (data.sv == expected_matrix).all()
def main():
    """
    Run an algorithm and print results
    """
    data = MatrixDataset(
        ma.masked_values(
            [[1, 4, 0, 5, 7, 0],
             [0, 2, 0, 4, 9, 0],
             [1, 0, 2, 3, 8, 4],
             [4, 0, 2, 5, 0, 0],
             [4, 0, 3, 3, 4, 4]],
            0))
    alg = AverageLog()
    results = alg.run(data)

    print("trust scores:")
    for source, trust_val in results.trust.items():
        print(" source {}: {:.3f}".format(source, trust_val))
    print("")
    print("belief scores:")
    for var in sorted(results.belief):
        print(" variable {}:".format(var))
        beliefs = results.belief[var]
        for val in sorted(beliefs):
            print("  {}: {:.3f}".format(val, beliefs[val]))
def test_matrix_renderer(self):
    buf = StringIO()
    buf.write(",5,7\n,,\n1,2,3")
    buf.seek(0)
    dataset = MatrixDataset.from_csv(buf)
    rend1 = MatrixDatasetGraphRenderer()
    rend2 = MatrixDatasetGraphRenderer(zero_indexed=False)
    rend1.render(dataset, BytesIO())
    rend2.render(dataset, BytesIO())

    assert rend1.get_source_label(0) == "s0"
    assert rend2.get_source_label(0) == "s1"
    assert rend1.get_var_label(0) == "v1"
    assert rend2.get_var_label(0) == "v2"

    # Note that source 1 (in 0-index terms) makes no claims: ID 1 should
    # therefore be source 2 (in 0-index terms)
    assert rend1.get_source_label(1) == "s2"
    assert rend2.get_source_label(1) == "s3"

    assert rend1.get_claim_label(0, 1) == "v1=7"
    assert rend2.get_claim_label(0, 1) == "v2=7"
def test_get_output_obj(self, csv_fileobj):
    dataset = MatrixDataset.from_csv(csv_fileobj)
    alg = Sums(iterator=FixedIterator(5))

    # Default should be all fields if none are given, but not accuracy
    # unless supervised data given
    results = alg.run(dataset)
    out1 = BaseClient().get_output_obj(results)
    exp_keys = {f.value for f in OutputFields if f != OutputFields.ACCURACY}
    assert set(out1.keys()) == exp_keys

    sup_data = SupervisedData.from_csv(csv_fileobj)
    sup_results = alg.run(sup_data.data)
    out2 = BaseClient().get_output_obj(sup_results, sup_data=sup_data)
    assert set(out2.keys()) == {f.value for f in OutputFields}
    assert out2["trust"] == sup_results.trust
    assert out2["belief"] == sup_results.belief

    out3 = BaseClient().get_output_obj(results,
                                       output_fields=[OutputFields.TRUST])
    assert set(out3.keys()) == {"trust"}
def test_results_diff(self, test_client):
    dataset1 = MatrixDataset(ma.masked_values([[1, 0], [0, 2]], 0))
    dataset2 = MatrixDataset(ma.masked_values([[1, 2], [0, 2]], 0))
    request_data1 = {"algorithm": "voting", "matrix": dataset1.to_csv()}
    resp1 = test_client.get("/run/", query_string=request_data1)
    assert resp1.status_code == 200

    request_data2 = {
        # Test diffs with multiple algorithms
        "algorithm": ["voting", "investment"],
        "matrix": dataset2.to_csv(),
        "previous_results": json.dumps(resp1.json["data"]["voting"])
    }
    resp2 = test_client.get("/run/", query_string=request_data2)
    assert resp2.status_code == 200

    vot_out = resp2.json["data"]["voting"]
    assert "diff" in vot_out
    assert vot_out["diff"]["trust"] == {"0": 0, "1": 0}  # no trust changes
    assert vot_out["diff"]["belief"] == {
        "0": {"1.0": -0.5},
        "1": {"2.0": 0}
    }

    inv_out = resp2.json["data"]["investment"]
    assert "diff" in inv_out
    assert inv_out["diff"]["trust"] == {"0": -0.7731216864273125, "1": 0.0}
    assert inv_out["diff"]["belief"] == {
        "0": {"1.0": -0.9354768146506873},
        "1": {"2.0": 0.0}
    }
def run(self):
    """
    Run an algorithm on a user-supplied dataset. Required HTTP parameters:

    * 'algorithm'
    * 'matrix'

    Optional parameters:

    * 'parameters'
    * 'previous_results'

    Responses are JSON objects of the form ``{"ok": True, "data": ...}``
    or ``{"ok": False, "error": ...}``
    """
    alg_labels = request.args.getlist("algorithm")
    matrix_csv = request.args.get("matrix")
    if not alg_labels or not matrix_csv:
        err_msg = "'algorithm' and 'matrix' parameters are required"
        return jsonify(ok=False, error=err_msg), 400

    matrix_csv = matrix_csv.replace("_", "")
    params_str = request.args.get("parameters")
    try:
        all_params = self.get_param_dict(params_str)
        dataset = MatrixDataset.from_csv(StringIO(matrix_csv))
    except ValueError as ex:
        return jsonify(ok=False, error=str(ex)), 400

    messages = []
    all_output = {}
    for alg_label in alg_labels:
        try:
            alg_cls = self.algorithm_cls(alg_label)
            params, ignored = self.get_algorithm_params(alg_cls, all_params)
            alg = self.get_algorithm_object(alg_cls, params)
        except ValueError as ex:
            return jsonify(ok=False, error=str(ex)), 400

        # Show a message for each of the ignored parameters
        if ignored:
            msg = self.get_ignored_parameters_message(alg_cls, ignored)
            messages.append(msg)

        try:
            results = alg.run(dataset)
        except ConvergenceError as ex:
            return jsonify(ok=False, error=str(ex)), 500
        except EmptyDatasetError as ex:
            return jsonify(ok=False, error=str(ex)), 400

        output = self.get_output_obj(results)

        # Construct a graph and/or animation
        output["imagery"] = {}
        cs = ResultsGradientColourScheme(results)
        renderer = self.get_graph_renderer(colours=cs)
        json_buffer = StringIO()
        renderer.render(dataset, json_buffer)
        output["imagery"]["graph"] = json_buffer.getvalue()

        # Note: can only produce an animation for iterative algorithms
        if isinstance(alg, BaseIterativeAlgorithm):
            animator = JsonAnimator(renderer=self.get_graph_renderer())
            json_buffer = StringIO()
            # Note: empty data and convergence errors would already have
            # been caught above, so no need to check here
            animator.animate(json_buffer, alg, dataset,
                             show_progress=False)
            output["imagery"]["animation"] = json_buffer.getvalue()

        # Include a diff against the previous results, if available
        prev_results = request.args.get("previous_results")
        if prev_results is not None:
            try:
                obj = self.get_results_object(prev_results)
            except ValueError as ex:
                err_msg = "'previous_results' is invalid: {}".format(ex)
                return jsonify(ok=False, error=err_msg), 400

            # Previous results have been converted to JSON, which may have
            # changed numeric keys to strings: to ensure results can be
            # compared, convert the current results to and from JSON
            current_results = self.get_results_object(json.dumps(output))
            diff = ResultDiff(obj, current_results)
            output["diff"] = {
                "trust": diff.trust,
                "belief": diff.belief,
                "time_taken": diff.time_taken,
                "iterations": diff.iterations
            }
        all_output[alg_label] = output
    return jsonify({"ok": True, "data": all_output, "messages": messages})
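# A minimal sketch of calling the endpoint above from Python. Assumptions:
# the app is served at http://localhost:5000 and the route is "/run/" (as in
# the test suite); the host/port and the use of the `requests` library are
# illustrative, not part of the original code.
import requests

resp = requests.get("http://localhost:5000/run/", params={
    "algorithm": ["voting", "sums"],  # may be given multiple times
    "matrix": "1,,3\n,2,3\n1,2,",     # CSV matrix; blank cells are unknown
})
body = resp.json()
if body["ok"]:
    for alg_label, output in body["data"].items():
        print(alg_label, output["trust"])
else:
    print("error:", body["error"])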
def dataset(self):
    return MatrixDataset(
        ma.masked_values(
            [[1, 2, 3, 2],
             [3, 0, 1, 2],
             [2, 2, 0, 0],
             [0, 1, 0, 3]],
            0))
def data(self):
    data_path = self.get_filepath("data.csv")
    with open(data_path) as csv_file:
        return MatrixDataset.from_csv(csv_file)
try:
    out_path = sys.argv[1]
except IndexError:
    print("usage: {} DEST".format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

# tuples = [
#     ("source 1", "x", 4),
#     ("source 1", "y", 7),
#     ("source 2", "y", 7),
#     ("source 2", "z", 5),
#     ("source 3", "x", 3),
#     ("source 3", "z", 5),
#     ("source 4", "x", 3),
#     ("source 4", "y", 6),
#     ("source 4", "z", 8)
# ]
# mydata = Dataset(tuples)

mydata = MatrixDataset(
    ma.masked_values(
        [[1, 9, 3, 4],
         [2, 2, 9, 9],
         [9, 9, 7, 9],
         [1, 2, 5, 9]],
        9))
it = ConvergenceIterator(DistanceMeasures.L2, 0.001)
algorithm = Investment(iterator=it)
cs = ResultsGradientColourScheme(algorithm.run(mydata))
rend = MatrixDatasetGraphRenderer(zero_indexed=False, colours=cs)
animator = GifAnimator(renderer=rend, frame_duration=0.2)
with open(out_path, "wb") as outfile:
    animator.animate(outfile, algorithm, mydata)
def test_num_sources_variables_claims(self):
    mat = MatrixDataset(
        np.array([[1, 4, 5],
                  [2, 0, 5],
                  [1, 1, 5],
                  [3, 2, 5]]))
    assert mat.num_sources == 4
    assert mat.num_variables == 3
    assert mat.num_claims == 8
def test_dimension(self):
    arr = np.zeros((3, 3, 3))
    with pytest.raises(ValueError):
        MatrixDataset(arr)
def test_invalid_csv_shape(self, tmpdir):
    filepath = tmpdir.join("data.csv")
    filepath.write("\n".join(["1,2,", "1,2"]))
    with pytest.raises(ValueError) as excinfo:
        MatrixDataset.from_csv(filepath.open())
    assert "Expected 3 entries in row 2, got 2" in str(excinfo.value)