def learn_generative(y_data, n_classes=13):
    """
    Uses Snorkel to learn a generative model of the relative accuracies of LFs.
    It learns one binary generative model for each class, and combines the
    per-class marginals into a set of noisy labels.

    Args:
        y_data: iterable of examples; each example is an iterable of LF votes,
            and each vote is indexable by class id (0 .. n_classes - 1).
        n_classes: number of classes / per-class models. Defaults to 13,
            the value previously hard-coded throughout this function.

    Returns:
        np.ndarray of shape (n_examples, n_classes) holding the training
        marginals for each class.
    """
    # labels[c] collects, for every example, the vector of LF votes on class c.
    labels = [[] for _ in range(n_classes)]
    for ex in y_data:
        for i in range(n_classes):
            labels[i].append(np.array([int(vote[i]) for vote in ex]))
    # BUG FIX: the original used `map(lambda x: np.array(x), labels)`; in
    # Python 3 `map` returns a lazy iterator, so `np.array(iterator)` yields a
    # 0-d object array and the enumerate() below fails. Materialize eagerly.
    labels = np.array([np.array(x) for x in labels])
    n_labels = []
    n_stats = []
    for i, class_lbl in enumerate(labels):
        print("learning generative model for label: {}".format(i))
        # NOTE(review): `session` is never used below; kept only in case
        # SnorkelSession() has required side effects — confirm and remove.
        session = SnorkelSession()
        gen_model = GenerativeModel()
        gen_model.train(class_lbl, epochs=100, decay=0.95,
                        step_size=0.1 / class_lbl.shape[0],
                        reg_param=1e-6, cardinality=2)
        train_marginals = gen_model.marginals(csr_matrix(class_lbl))
        n_labels.append(train_marginals)
        n_stats.append(gen_model.learned_lf_stats())
    # Persist per-class LF statistics, one CSV per class-label name.
    for i, stats in enumerate(n_stats):
        stats.to_csv("./results/lf_stats/" + int_to_label[i],
                     sep=',', encoding='utf-8')
    # Transpose so rows are examples and columns are classes.
    return np.array(n_labels).T
def test_supervised(self):
    """End-to-end check of GenerativeModel on synthetic binary data.

    Builds a 10k-row label matrix from five LFs with known accuracies plus
    a small supervised gold-label set, then verifies the learned LF
    accuracies and coverages against the generating values under several
    prior/supervision configurations.

    BUG FIX applied throughout: the coverage assertions previously read
    `np.all(np.abs(coverage - expected < tol))` — the `< tol` comparison was
    inside `np.abs`, so the test took abs() of a boolean array and only
    checked `coverage - expected < tol` (one-sided). The closing parenthesis
    now wraps just the difference: `np.abs(coverage - expected) < tol`.
    """
    # A set of true priors
    tol = 0.1
    LF_acc_priors = [0.75, 0.75, 0.75, 0.75, 0.9]
    cardinality = 2
    # Map accuracies to log-scale prior weights
    LF_acc_prior_weights = [
        0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in LF_acc_priors
    ]
    label_prior = 1

    # Defines a label matrix
    n = 10000
    L = sparse.lil_matrix((n, 5), dtype=np.int64)
    # Store the supervised gold labels separately
    labels = np.zeros(n, np.int64)
    for i in range(n):
        y = 2 * random.randint(0, 1) - 1
        # First four LFs always vote, and have decent acc
        L[i, 0] = y * (2 * (random.random() < LF_acc_priors[0]) - 1)
        L[i, 1] = y * (2 * (random.random() < LF_acc_priors[1]) - 1)
        L[i, 2] = y * (2 * (random.random() < LF_acc_priors[2]) - 1)
        L[i, 3] = y * (2 * (random.random() < LF_acc_priors[3]) - 1)
        # The fifth LF is very accurate but has a much smaller coverage
        if random.random() < 0.2:
            L[i, 4] = y * (2 * (random.random() < LF_acc_priors[4]) - 1)
        # The sixth LF is a small supervised set
        if random.random() < 0.1:
            labels[i] = y

    # Test with priors -- first check init vals are correct
    print("Testing init:")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, LF_acc_prior_weights=LF_acc_prior_weights,
                    labels=labels, reg_type=2, reg_param=1, epochs=0)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    print(accs)
    print(gen_model.weights.lf_propensity)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))

    # Now test that estimated LF accs are not too far off
    print("\nTesting estimated LF accs (TOL=%s)" % tol)
    gen_model.train(
        L,
        LF_acc_prior_weights=LF_acc_prior_weights,
        labels=labels,
        reg_type=0,
        reg_param=0.0,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

    # Test without supervised
    print("\nTesting without supervised")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, reg_type=0)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))

    # Test with supervised
    print("\nTesting with supervised, without priors")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, labels=labels, reg_type=0)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

    # Test without supervised, and (intentionally) bad priors, but weak strength
    print("\nTesting without supervised, with bad priors (weak)")
    gen_model = GenerativeModel(lf_propensity=True)
    bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
    bad_prior_weights = [
        0.5 * np.log((cardinality - 1.0) * x / (1 - x)) for x in bad_prior
    ]
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=0,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))

    # Test without supervised, and (intentionally) bad priors
    print("\nTesting without supervised, with bad priors (strong)")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=2,
        reg_param=100 * n,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
def test_compile_no_deps(self):
    """Check the _compile() output (weights, variables, factors, factor-to-
    variable map, domain mask, edge count) for a 5x3 label matrix with a
    class prior, fixed LF accuracy priors, and no dependencies.

    Variable layout below: the first 5 variables are the latent labels,
    followed by 5*3 evidence variables (one per (candidate, LF) cell).
    """
    # Defines a label matrix
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains

    # Tests compilation
    gen_model = GenerativeModel(class_prior=True,
                                lf_prior=False,
                                lf_propensity=False,
                                lf_class_propensity=False)
    gen_model._process_dependency_graph(L, ())
    m, n = L.shape
    # One (fixed) accuracy prior weight per LF; none marked as fixed-value LFs.
    LF_acc_prior_weights = [1.0 for _ in range(n)]
    is_fixed = [False for _ in range(n)]
    gen_model.cardinality = 2
    cardinalities = 2 * np.ones(5)
    weight, variable, factor, ftv, domain_mask, n_edges =\
        gen_model._compile(L, 0.5, 0.0, LF_acc_prior_weights, is_fixed,
                           cardinalities)

    #
    # Weights
    #
    # Should now be 3 for LFs + 3 (fixed) for LF priors + 1 class prior
    self.assertEqual(len(weight), 7)
    self.assertFalse(weight[0]['isFixed'])
    self.assertEqual(weight[0]['initialValue'], 0.0)
    # The LF priors (odd indices) are fixed at the supplied prior weight.
    for i in range(1, 7, 2):
        self.assertTrue(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 1.0)
    # The LF weights (even indices > 0) are learnable, initialized to 0.
    for i in range(2, 7, 2):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.0)

    #
    # Variables
    #
    self.assertEqual(len(variable), 20)
    # Latent label variables: non-evidence, binary.
    for i in range(5):
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0
                        or variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 2)
    # Evidence variables: one per L[i, j], with remapped label values.
    for i in range(5):
        for j in range(3):
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
            if L[i, j] == -1:
                l = 0
            elif L[i, j] == 0:
                l = 2
            elif L[i, j] == 1:
                l = 1
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

    #
    # Factors
    #
    # 5 * 3 LF acc factors + 5 * 3 LF prior factors + 5 class prior factors
    self.assertEqual(len(factor), 35)
    # Class prior factors come first, one per candidate.
    for i in range(5):
        self.assertEqual(factor[i]["factorFunction"],
                         FACTORS["DP_GEN_CLASS_PRIOR"])
        self.assertEqual(factor[i]["weightId"], 0)
        self.assertEqual(factor[i]["featureValue"], 1)
        self.assertEqual(factor[i]["arity"], 1)
        self.assertEqual(factor[i]["ftv_offset"], i)
    # Then, per candidate, 6 accuracy factors (weight + prior for each LF).
    for i in range(5):
        for j in range(6):
            self.assertEqual(factor[5 + i * 6 + j]["factorFunction"],
                             FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[5 + i * 6 + j]["weightId"], j + 1)
            self.assertEqual(factor[5 + i * 6 + j]["featureValue"], 1)
            self.assertEqual(factor[5 + i * 6 + j]["arity"], 2)
            self.assertEqual(factor[5 + i * 6 + j]["ftv_offset"],
                             5 + 2 * (i * 6 + j))

    #
    # Factor to Var
    #
    self.assertEqual(len(ftv), 65)
    # Class prior factor - var edges
    for i in range(5):
        self.assertEqual(ftv[i]["vid"], i)
        self.assertEqual(ftv[i]["dense_equal_to"], 0)
    # LF *and LF prior* factor - var edges
    for i in range(5):
        for j in range(3):
            # Each LF has one weight factor and one prior factor here
            for k in range(2):
                idx = 4 * (i * 3 + j) + 2 * k
                self.assertEqual(ftv[5 + idx]["vid"], i)
                self.assertEqual(ftv[6 + idx]["vid"], 5 + i * 3 + j)
                self.assertEqual(ftv[5 + idx]["dense_equal_to"], 0)
                self.assertEqual(ftv[6 + idx]["dense_equal_to"], 0)

    #
    # Domain mask
    #
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])

    # n_edges
    self.assertEqual(n_edges, 65)
def test_compile_with_deps(self):
    """Check the _compile() output for a 5x3 label matrix with LF
    propensity enabled and all four dependency types declared between LFs.

    Factor layout (per the offsets asserted below): 30 LF accuracy factors
    (weight + prior per LF per candidate), then 15 propensity factors, then
    5 factors for each of the 5 declared dependencies (70 total).
    """
    # Defines a label matrix
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains

    # Defined dependencies
    deps = []
    deps.append((0, 1, DEP_SIMILAR))
    deps.append((0, 2, DEP_SIMILAR))
    deps.append((0, 1, DEP_FIXING))
    deps.append((0, 2, DEP_REINFORCING))
    deps.append((1, 2, DEP_EXCLUSIVE))

    # Tests compilation
    gen_model = GenerativeModel(class_prior=False,
                                lf_prior=False,
                                lf_propensity=True,
                                lf_class_propensity=False)
    gen_model._process_dependency_graph(L, deps)
    m, n = L.shape
    LF_acc_prior_weights = [1.0 for _ in range(n)]
    is_fixed = [False for _ in range(n)]
    gen_model.cardinality = 2
    cardinalities = 2 * np.ones(5)
    weight, variable, factor, ftv, domain_mask, n_edges =\
        gen_model._compile(L, 0.5, -1.0, LF_acc_prior_weights, is_fixed,
                           cardinalities)

    #
    # Weights
    #
    # Should now be 3 for LFs + 3 fixed for LF priors + 3 for LF propensity
    # + 5 for deps
    self.assertEqual(len(weight), 14)
    # The LF priors (even indices < 6) are fixed at the supplied weight.
    for i in range(0, 6, 2):
        self.assertTrue(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 1.0)
    # The LF weights (odd indices < 6) are learnable, initialized to 0.
    for i in range(1, 6, 2):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.0)
    # The dep weights (propensity + dependency weights) start at 0.5,
    # matching the init value passed to _compile above.
    for i in range(6, 14):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.5)

    #
    # Variables
    #
    self.assertEqual(len(variable), 20)
    # Latent label variables: non-evidence, binary.
    for i in range(5):
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0
                        or variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 2)
    # Evidence variables: one per L[i, j] cell, with remapped label values.
    for i in range(5):
        for j in range(3):
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
            if L[i, j] == -1:
                l = 0
            elif L[i, j] == 0:
                l = 2
            elif L[i, j] == 1:
                l = 1
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

    #
    # Factors
    #
    self.assertEqual(len(factor), 70)
    # LF accuracy factors: 6 per candidate (weight + prior for each LF).
    f_offset = 0
    ftv_offset = 0
    for i in range(5):
        for j in range(6):
            self.assertEqual(
                factor[f_offset + i * 6 + j]["factorFunction"],
                FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[f_offset + i * 6 + j]["weightId"], j)
            self.assertEqual(factor[f_offset + i * 6 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 6 + j]["arity"], 2)
            self.assertEqual(factor[f_offset + i * 6 + j]["ftv_offset"],
                             ftv_offset + 2 * (i * 6 + j))
    # LF propensity factors: one per (candidate, LF) cell, arity 1.
    f_offset = 30
    ftv_offset = 60
    for i in range(5):
        for j in range(3):
            self.assertEqual(
                factor[f_offset + i * 3 + j]["factorFunction"],
                FACTORS["DP_GEN_LF_PROPENSITY"])
            self.assertEqual(factor[f_offset + i * 3 + j]["weightId"],
                             6 + j)
            self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
            self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                             ftv_offset + (i * 3 + j))
    # DEP_SIMILAR(0, 1): one arity-2 factor per candidate.
    f_offset = 45
    ftv_offset = 75
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_SIMILAR"])
        self.assertEqual(factor[f_offset + i]["weightId"], 9)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # DEP_SIMILAR(0, 2)
    f_offset = 50
    ftv_offset = 85
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_SIMILAR"])
        self.assertEqual(factor[f_offset + i]["weightId"], 10)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # DEP_FIXING(0, 1): arity 3 (latent label + both LF cells).
    f_offset = 55
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_FIXING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 11)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # DEP_REINFORCING(0, 2): arity 3.
    f_offset = 60
    ftv_offset = 110
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_REINFORCING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 12)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # DEP_EXCLUSIVE(1, 2): arity 2.
    f_offset = 65
    ftv_offset = 125
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_EXCLUSIVE"])
        self.assertEqual(factor[f_offset + i]["weightId"], 13)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)

    #
    # Factor to Var
    #
    self.assertEqual(len(ftv), 135)
    # Accuracy factor edges: weight + prior factor per LF cell.
    ftv_offset = 0
    for i in range(5):
        for j in range(3):
            for k in range(2):
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k]["vid"], i)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) +
                        2 * k]["dense_equal_to"], 0)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k + 1]["vid"],
                    5 + i * 3 + j)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k +
                        1]["dense_equal_to"], 0)
    # Propensity factor edges: one per LF cell.
    ftv_offset = 60
    for i in range(5):
        for j in range(3):
            self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"],
                             5 + i * 3 + j)
            self.assertEqual(
                ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)
    # DEP_SIMILAR(0, 1) edges: LF 0 and LF 1 cells.
    ftv_offset = 75
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # DEP_SIMILAR(0, 2) edges: LF 0 and LF 2 cells.
    ftv_offset = 85
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # DEP_FIXING(0, 1) edges: latent label, LF 0 cell, LF 1 cell.
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # DEP_REINFORCING(0, 2) edges: latent label, LF 0 cell, LF 2 cell.
    ftv_offset = 110
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # DEP_EXCLUSIVE(1, 2) edges: LF 1 and LF 2 cells.
    ftv_offset = 125
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

    #
    # Domain mask
    #
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])

    # n_edges
    self.assertEqual(n_edges, 135)
print("Commit to snorkel database done...") #writing label generator def worker_label_generator(t): for worker_id in cand_dict[t.tweet.stable_id]: yield worker_id, cand_dict[t.tweet.stable_id][worker_id] np.random.seed(1701) labeler = LabelAnnotator(label_generator=worker_label_generator) L_train = labeler.apply(split=0) print(L_train.lf_stats(session)) print("Creat training data done...") print(" -train data shape", (L_train.shape)) print("Start to train a generative model") gen_model = GenerativeModel(lf_propensity=True) gen_model.train(L_train, reg_type=2, reg_param=0.1, epochs=30) #doing statistics print(gen_model.learned_lf_stats()) print("Train a genetive model done...!") train_marginals = gen_model.marginals(L_train) print("Number of examples:", len(train_marginals)) print(train_marginals)
def _test_categorical(self, L, LF_acc_priors, labels, label_prior=1,
                      candidate_ranges=None, cardinality=4, tol=0.1,
                      n=10000):
    """Run a suite of tests of GenerativeModel on a categorical matrix.

    Args:
        L: label matrix to train on.
        LF_acc_priors: list of true LF accuracies used to generate L.
        labels: supervised gold labels (0 where unlabeled).
        label_prior: expected accuracy entry for the supervised "LF".
        candidate_ranges: optional per-candidate value ranges.
        cardinality: number of classes.
        tol: tolerance for accuracy/coverage comparisons.
        n: number of candidates (used to scale the strong prior reg).

    BUG FIX applied throughout: the coverage assertions previously read
    `np.all(np.abs(coverage - expected < tol))` — the comparison was inside
    np.abs, so the check was one-sided on a boolean array. The parenthesis
    now closes after the difference: `np.abs(coverage - expected) < tol`.
    """
    # Map to log scale weights
    LF_acc_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x))
                            for x in LF_acc_priors]

    # Test with priors -- first check init vals are correct
    print("Testing init:")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        LF_acc_prior_weights=LF_acc_prior_weights,
        labels=labels,
        reg_type=2,
        reg_param=1,
        epochs=0,
        candidate_ranges=candidate_ranges
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    print(accs)
    print(gen_model.weights.lf_propensity)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Now test that estimated LF accs are not too far off
    print("\nTesting estimated LF accs (TOL=%s)" % tol)
    t0 = time()
    gen_model.train(
        L,
        LF_acc_prior_weights=LF_acc_prior_weights,
        labels=labels,
        reg_type=0,
        reg_param=0.0,
        candidate_ranges=candidate_ranges
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised
    print("\nTesting without supervised")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test with supervised
    print("\nTesting with supervised, without priors")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        labels=labels,
        reg_type=0,
        candidate_ranges=candidate_ranges
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised, and (intentionally) bad priors, but weak strength
    print("\nTesting without supervised, with bad priors (weak)")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
    bad_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x))
                         for x in bad_prior]
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=0,
        candidate_ranges=candidate_ranges
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised, and (intentionally) bad priors
    print("\nTesting without supervised, with bad priors (strong)")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=2,
        reg_param=100 * n,
        candidate_ranges=candidate_ranges
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
    print("Finished in {0} sec.".format(time() - t0))
def test_supervised(self):
    """End-to-end check of GenerativeModel on synthetic binary data.

    Builds a 10k-row label matrix from five LFs with known accuracies plus
    a small supervised gold-label set, then verifies the learned LF
    accuracies and coverages under several prior/supervision settings.

    BUG FIX applied throughout: the coverage assertions previously read
    `np.all(np.abs(coverage - expected < tol))` — the comparison was inside
    np.abs, so the test took abs() of a boolean array and was one-sided.
    The parenthesis now closes after the difference.
    """
    # A set of true priors
    tol = 0.1
    LF_acc_priors = [0.75, 0.75, 0.75, 0.75, 0.9]
    cardinality = 2
    # Map accuracies to log-scale prior weights
    LF_acc_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x))
                            for x in LF_acc_priors]
    label_prior = 1

    # Defines a label matrix
    n = 10000
    L = sparse.lil_matrix((n, 5), dtype=np.int64)
    # Store the supervised gold labels separately
    labels = np.zeros(n, np.int64)
    for i in range(n):
        y = 2 * random.randint(0, 1) - 1
        # First four LFs always vote, and have decent acc
        L[i, 0] = y * (2 * (random.random() < LF_acc_priors[0]) - 1)
        L[i, 1] = y * (2 * (random.random() < LF_acc_priors[1]) - 1)
        L[i, 2] = y * (2 * (random.random() < LF_acc_priors[2]) - 1)
        L[i, 3] = y * (2 * (random.random() < LF_acc_priors[3]) - 1)
        # The fifth LF is very accurate but has a much smaller coverage
        if random.random() < 0.2:
            L[i, 4] = y * (2 * (random.random() < LF_acc_priors[4]) - 1)
        # The sixth LF is a small supervised set
        if random.random() < 0.1:
            labels[i] = y

    # Test with priors -- first check init vals are correct
    print("Testing init:")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        LF_acc_prior_weights=LF_acc_prior_weights,
        labels=labels,
        reg_type=2,
        reg_param=1,
        epochs=0
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    print(accs)
    print(gen_model.weights.lf_propensity)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))

    # Now test that estimated LF accs are not too far off
    print("\nTesting estimated LF accs (TOL=%s)" % tol)
    gen_model.train(
        L,
        LF_acc_prior_weights=LF_acc_prior_weights,
        labels=labels,
        reg_type=0,
        reg_param=0.0,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

    # Test without supervised
    print("\nTesting without supervised")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, reg_type=0)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))

    # Test with supervised
    print("\nTesting with supervised, without priors")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        labels=labels,
        reg_type=0
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(np.all(
        np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))

    # Test without supervised, and (intentionally) bad priors, but weak strength
    print("\nTesting without supervised, with bad priors (weak)")
    gen_model = GenerativeModel(lf_propensity=True)
    bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
    bad_prior_weights = [0.5 * np.log((cardinality - 1.0) * x / (1 - x))
                         for x in bad_prior]
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=0,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))

    # Test without supervised, and (intentionally) bad priors
    print("\nTesting without supervised, with bad priors (strong)")
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(
        L,
        LF_acc_prior_weights=bad_prior_weights,
        reg_type=2,
        reg_param=100 * n,
    )
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
def test_compile_no_deps(self):
    """Check the _compile() output for a 5x3 label matrix with a class
    prior and no dependencies (older 3-argument _compile API).

    Variable layout: first 5 variables are the latent labels, followed by
    5*3 evidence variables (one per (candidate, LF) cell).
    """
    # Defines a label matrix
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains

    # Tests compilation
    gen_model = GenerativeModel(class_prior=True,
                                lf_prior=False,
                                lf_propensity=False,
                                lf_class_propensity=False)
    gen_model._process_dependency_graph(L, ())
    weight, variable, factor, ftv, domain_mask, n_edges = gen_model._compile(
        L, None, 1.0)

    #
    # Weights
    #
    # 1 class prior weight + 3 LF accuracy weights.
    self.assertEqual(len(weight), 4)
    self.assertFalse(weight[0]['isFixed'])
    self.assertEqual(weight[0]['initialValue'], 0.0)
    # LF weights are initialized near the 1.0 init value passed to _compile.
    for i in range(1, 4):
        self.assertFalse(weight[i]['isFixed'])
        self.assertTrue(0.9 <= weight[i]['initialValue'] <= 1.1)

    #
    # Variables
    #
    self.assertEqual(len(variable), 20)
    # Latent label variables: non-evidence, binary.
    for i in range(5):
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0
                        or variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 3)
    for i in range(5):
        for j in range(3):
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Evidence values are shifted: {-1, 0, 1} in L -> {0, 1, 2}.
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'],
                             L[i, j] + 1)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

    #
    # Factors
    #
    # 5 class prior factors + 5 * 3 LF accuracy factors.
    self.assertEqual(len(factor), 20)
    for i in range(5):
        self.assertEqual(factor[i]["factorFunction"],
                         FACTORS["DP_GEN_CLASS_PRIOR"])
        self.assertEqual(factor[i]["weightId"], 0)
        self.assertEqual(factor[i]["featureValue"], 1)
        self.assertEqual(factor[i]["arity"], 1)
        self.assertEqual(factor[i]["ftv_offset"], i)
    for i in range(5):
        for j in range(3):
            self.assertEqual(factor[5 + i * 3 + j]["factorFunction"],
                             FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[5 + i * 3 + j]["weightId"], j + 1)
            self.assertEqual(factor[5 + i * 3 + j]["featureValue"], 1)
            self.assertEqual(factor[5 + i * 3 + j]["arity"], 2)
            self.assertEqual(factor[5 + i * 3 + j]["ftv_offset"],
                             5 + 2 * (i * 3 + j))

    #
    # Factor to Var
    #
    self.assertEqual(len(ftv), 35)
    # Class prior factor - var edges.
    for i in range(5):
        self.assertEqual(ftv[i]["vid"], i)
        self.assertEqual(ftv[i]["dense_equal_to"], 0)
    # LF accuracy factor edges: latent label + evidence cell per factor.
    for i in range(5):
        for j in range(3):
            self.assertEqual(ftv[5 + 2 * (i * 3 + j)]["vid"], i)
            self.assertEqual(ftv[6 + 2 * (i * 3 + j)]["vid"],
                             5 + i * 3 + j)
            self.assertEqual(ftv[5 + 2 * (i * 3 + j)]["dense_equal_to"], 0)
            self.assertEqual(ftv[6 + 2 * (i * 3 + j)]["dense_equal_to"], 0)

    #
    # Domain mask
    #
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])

    # n_edges
    self.assertEqual(n_edges, 35)
def test_compile_with_deps(self):
    """Check the _compile() output for a 5x3 label matrix with LF
    propensity enabled and all four dependency types declared (older
    3-argument _compile API).

    Factor layout (per the offsets asserted below): 15 LF accuracy factors,
    then 15 propensity factors, then 5 factors for each of the 5 declared
    dependencies (55 total).
    """
    # Defines a label matrix
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains

    # Defined dependencies
    deps = []
    deps.append((0, 1, DEP_SIMILAR))
    deps.append((0, 2, DEP_SIMILAR))
    deps.append((0, 1, DEP_FIXING))
    deps.append((0, 2, DEP_REINFORCING))
    deps.append((1, 2, DEP_EXCLUSIVE))

    # Tests compilation
    gen_model = GenerativeModel(class_prior=False,
                                lf_prior=False,
                                lf_propensity=True,
                                lf_class_propensity=False)
    gen_model._process_dependency_graph(L, deps)
    weight, variable, factor, ftv, domain_mask, n_edges = gen_model._compile(
        L, None, 1.0)

    #
    # Weights
    #
    # 3 LF accuracy + 3 propensity + 5 dependency weights.
    self.assertEqual(len(weight), 11)
    # LF accuracy weights start near the 1.0 init value.
    for i in range(3):
        self.assertFalse(weight[i]['isFixed'])
        self.assertTrue(0.9 <= weight[i]['initialValue'] <= 1.1)
    # Propensity and dependency weights start at 0.
    for i in range(3, 11):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.0)

    #
    # Variables
    #
    self.assertEqual(len(variable), 20)
    # Latent label variables: non-evidence, binary.
    for i in range(5):
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0
                        or variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 2)
    for i in range(5):
        for j in range(3):
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Evidence values are shifted: {-1, 0, 1} in L -> {0, 1, 2}.
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'],
                             L[i, j] + 1)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)

    #
    # Factors
    #
    self.assertEqual(len(factor), 55)
    # LF accuracy factors: one per (candidate, LF) cell.
    f_offset = 0
    ftv_offset = 0
    for i in range(5):
        for j in range(3):
            self.assertEqual(
                factor[f_offset + i * 3 + j]["factorFunction"],
                FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[f_offset + i * 3 + j]["weightId"], j)
            self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 2)
            self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                             ftv_offset + 2 * (i * 3 + j))
    # LF propensity factors: one per (candidate, LF) cell, arity 1.
    f_offset = 15
    ftv_offset = 30
    for i in range(5):
        for j in range(3):
            self.assertEqual(
                factor[f_offset + i * 3 + j]["factorFunction"],
                FACTORS["DP_GEN_LF_PROPENSITY"])
            self.assertEqual(factor[f_offset + i * 3 + j]["weightId"],
                             3 + j)
            self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
            self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                             ftv_offset + (i * 3 + j))
    # DEP_SIMILAR(0, 1): compiled to EQUAL factors, one per candidate.
    f_offset = 30
    ftv_offset = 45
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["EQUAL"])
        self.assertEqual(factor[f_offset + i]["weightId"], 6)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # DEP_SIMILAR(0, 2)
    f_offset = 35
    ftv_offset = 55
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["EQUAL"])
        self.assertEqual(factor[f_offset + i]["weightId"], 7)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # DEP_FIXING(0, 1): arity 3 (latent label + both LF cells).
    f_offset = 40
    ftv_offset = 65
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_FIXING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 8)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # DEP_REINFORCING(0, 2): arity 3.
    f_offset = 45
    ftv_offset = 80
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_REINFORCING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 9)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # DEP_EXCLUSIVE(1, 2): arity 2.
    f_offset = 50
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_EXCLUSIVE"])
        self.assertEqual(factor[f_offset + i]["weightId"], 10)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)

    #
    # Factor to Var
    #
    self.assertEqual(len(ftv), 105)
    # Accuracy factor edges: latent label + evidence cell per factor.
    ftv_offset = 0
    for i in range(5):
        for j in range(3):
            self.assertEqual(ftv[ftv_offset + 2 * (i * 3 + j)]["vid"], i)
            self.assertEqual(
                ftv[ftv_offset + 2 * (i * 3 + j)]["dense_equal_to"], 0)
            self.assertEqual(ftv[ftv_offset + 2 * (i * 3 + j) + 1]["vid"],
                             5 + i * 3 + j)
            self.assertEqual(
                ftv[ftv_offset + 2 * (i * 3 + j) + 1]["dense_equal_to"], 0)
    # Propensity factor edges: one per LF cell.
    ftv_offset = 30
    for i in range(5):
        for j in range(3):
            self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"],
                             5 + i * 3 + j)
            self.assertEqual(
                ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)
    # DEP_SIMILAR(0, 1) edges: LF 0 and LF 1 cells.
    ftv_offset = 45
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # DEP_SIMILAR(0, 2) edges: LF 0 and LF 2 cells.
    ftv_offset = 55
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # DEP_FIXING(0, 1) edges: latent label, LF 0 cell, LF 1 cell.
    ftv_offset = 65
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # DEP_REINFORCING(0, 2) edges: latent label, LF 0 cell, LF 2 cell.
    ftv_offset = 80
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # DEP_EXCLUSIVE(1, 2) edges: LF 1 and LF 2 cells.
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)

    #
    # Domain mask
    #
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])

    # n_edges
    self.assertEqual(n_edges, 105)
def _test_categorical(self, L, LF_acc_priors, labels, label_prior=1,
                      candidate_ranges=None, cardinality=4, tol=0.1,
                      n=10000):
    """Run a suite of generative-model tests on a categorical label matrix.

    Trains a ``GenerativeModel`` under several prior / supervision settings
    and asserts that the learned LF accuracies (and coverages) land within
    ``tol`` of the expected values.

    Args:
        L: sparse label matrix, one row per candidate, one column per LF.
        LF_acc_priors: list of true/prior accuracies, one per LF.
        labels: supervised (gold) labels; 0 entries mean "unlabeled".
        label_prior: assumed accuracy of the supervised labels.
        candidate_ranges: optional per-candidate label ranges, passed
            through to ``GenerativeModel.train``.
        cardinality: number of label classes.
        tol: absolute tolerance for the accuracy / coverage assertions.
        n: number of candidates; scales ``reg_param`` in the strong-prior
            test below.
    """
    # Map accuracy priors to log-scale weights.
    # Use a list comprehension, not map(): in Python 3 map() returns a
    # single-use iterator, which would be exhausted by the first train()
    # call and silently supply no priors to the later ones.
    LF_acc_prior_weights = [
        0.5 * np.log((cardinality - 1.0) * x / (1 - x))
        for x in LF_acc_priors
    ]

    # Test with priors -- first check init vals are correct
    print("Testing init:")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L,
                    LF_acc_prior_weights=LF_acc_prior_weights,
                    labels=labels,
                    reg_type=2,
                    reg_param=1,
                    epochs=0,
                    candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    print(accs)
    print(gen_model.weights.lf_propensity)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Now test that estimated LF accs are not too far off
    print("\nTesting estimated LF accs (TOL=%s)" % tol)
    t0 = time()
    gen_model.train(L,
                    LF_acc_prior_weights=LF_acc_prior_weights,
                    labels=labels,
                    reg_type=0,
                    reg_param=0.0,
                    candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    # Fix: '< tol' belongs outside np.abs(...). The original had it inside,
    # which turned the two-sided tolerance check into a one-sided boolean
    # comparison.
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised
    print("\nTesting without supervised")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L, reg_type=0, candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test with supervised
    print("\nTesting with supervised, without priors")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L,
                    labels=labels,
                    reg_type=0,
                    candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors + [label_prior])
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    self.assertTrue(
        np.all(np.abs(coverage - np.array([1, 1, 1, 1, 0.2, 0.1])) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised, and (intentionally) bad priors, but weak
    # strength
    print("\nTesting without supervised, with bad priors (weak)")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    bad_prior = [0.9, 0.8, 0.7, 0.6, 0.5]
    # Again a list, not map(): these weights are reused by the strong-prior
    # test below, so they must survive more than one iteration.
    bad_prior_weights = [
        0.5 * np.log((cardinality - 1.0) * x / (1 - x))
        for x in bad_prior
    ]
    gen_model.train(L,
                    LF_acc_prior_weights=bad_prior_weights,
                    reg_type=0,
                    candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    print(coverage)
    priors = np.array(LF_acc_priors)
    self.assertTrue(np.all(np.abs(accs - priors) < tol))
    print("Finished in {0} sec.".format(time() - t0))

    # Test without supervised, and (intentionally) bad priors
    print("\nTesting without supervised, with bad priors (strong)")
    t0 = time()
    gen_model = GenerativeModel(lf_propensity=True)
    gen_model.train(L,
                    LF_acc_prior_weights=bad_prior_weights,
                    reg_type=2,
                    reg_param=100 * n,
                    candidate_ranges=candidate_ranges)
    stats = gen_model.learned_lf_stats()
    accs = stats["Accuracy"]
    coverage = stats["Coverage"]
    print(accs)
    self.assertTrue(np.all(np.abs(accs - np.array(bad_prior)) < tol))
    print("Finished in {0} sec.".format(time() - t0))
def test_compile_no_deps(self):
    """Test GenerativeModel._compile on a 5x3 label matrix with no LF deps.

    Builds a small label matrix, compiles the factor graph with a class
    prior enabled, and asserts the exact contents of the compiled weight,
    variable, factor, factor-to-variable, and domain-mask arrays.
    """
    # Defines a label matrix: 5 candidates x 3 labeling functions
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains
    # Tests compilation
    gen_model = GenerativeModel(class_prior=True, lf_prior=False,
                                lf_propensity=False,
                                lf_class_propensity=False)
    # Empty dependency tuple: no LF-LF dependency factors are compiled.
    gen_model._process_dependency_graph(L, ())
    m, n = L.shape
    # One (fixed) accuracy prior weight per LF, none pinned as fixed LFs.
    LF_acc_prior_weights = [1.0 for _ in range(n)]
    is_fixed = [False for _ in range(n)]
    gen_model.cardinality = 2
    cardinalities = 2 * np.ones(5)
    weight, variable, factor, ftv, domain_mask, n_edges =\
        gen_model._compile(L, 0.5, 0.0, LF_acc_prior_weights, is_fixed,
                           cardinalities)
    #
    # Weights
    #
    # Should now be 3 for LFs + 3 (fixed) for LF priors + 1 class prior
    self.assertEqual(len(weight), 7)
    # weight[0] is the (learnable) class prior, initialized at 0.0
    self.assertFalse(weight[0]['isFixed'])
    self.assertEqual(weight[0]['initialValue'], 0.0)
    # The LF priors (odd indices): fixed at the supplied prior weight 1.0
    for i in range(1,7,2):
        self.assertTrue(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 1.0)
    # The LF weights (even indices): learnable, initialized at 0.0
    for i in range(2,7,2):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.0)
    #
    # Variables
    #
    # 5 latent candidate variables + 5 * 3 observed LF-label variables
    self.assertEqual(len(variable), 20)
    for i in range(5):
        # Latent (query) variables: binary, non-evidence
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0 or
                        variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 2)
    for i in range(5):
        for j in range(3):
            # Observed LF votes are evidence variables
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
            if L[i, j] == -1:
                l = 0
            elif L[i, j] == 0:
                l = 2
            elif L[i,j] == 1:
                l = 1
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            # Ternary: {-1, abstain, +1} mapped to {0, 2, 1}
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)
    #
    # Factors
    #
    # 5 * 3 LF acc factors + 5 * 3 LF prior factors + 5 class prior factors
    self.assertEqual(len(factor), 35)
    # Class prior factors come first: one unary factor per candidate
    for i in range(5):
        self.assertEqual(factor[i]["factorFunction"],
                         FACTORS["DP_GEN_CLASS_PRIOR"])
        self.assertEqual(factor[i]["weightId"], 0)
        self.assertEqual(factor[i]["featureValue"], 1)
        self.assertEqual(factor[i]["arity"], 1)
        self.assertEqual(factor[i]["ftv_offset"], i)
    # Then, per candidate, 6 accuracy factors (LF weight + LF prior,
    # interleaved per LF), all binary (candidate, LF-label) factors
    for i in range(5):
        for j in range(6):
            self.assertEqual(factor[5 + i * 6 + j]["factorFunction"],
                             FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[5 + i * 6 + j]["weightId"], j + 1)
            self.assertEqual(factor[5 + i * 6 + j]["featureValue"], 1)
            self.assertEqual(factor[5 + i * 6 + j]["arity"], 2)
            self.assertEqual(factor[5 + i * 6 + j]["ftv_offset"],
                             5 + 2 * (i * 6 + j))
    #
    # Factor to Var
    #
    # 5 class-prior edges + 2 edges per accuracy factor (5 * 6 * 2)
    self.assertEqual(len(ftv), 65)
    # Class prior factor - var edges
    for i in range(5):
        self.assertEqual(ftv[i]["vid"], i)
        self.assertEqual(ftv[i]["dense_equal_to"], 0)
    # LF *and LF prior* factor - var edges
    for i in range(5):
        for j in range(3):
            # Each LF has one weight factor and one prior factor here
            for k in range(2):
                idx = 4 * (i * 3 + j) + 2 * k
                # First edge points at the latent candidate variable,
                # second at the observed LF-label variable.
                self.assertEqual(ftv[5 + idx]["vid"], i)
                self.assertEqual(ftv[6 + idx]["vid"], 5 + i * 3 + j)
                self.assertEqual(ftv[5 + idx]["dense_equal_to"], 0)
                self.assertEqual(ftv[6 + idx]["dense_equal_to"], 0)
    #
    # Domain mask
    #
    # No candidate_ranges were supplied, so no variable is masked.
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])
    # n_edges equals the total number of factor-to-variable entries
    self.assertEqual(n_edges, 65)
def test_compile_with_deps(self):
    """Test GenerativeModel._compile on a 5x3 label matrix with LF deps.

    Same label matrix as the no-deps test, but with SIMILAR / FIXING /
    REINFORCING / EXCLUSIVE dependencies declared between LFs and
    lf_propensity enabled. Asserts the exact contents of the compiled
    weight, variable, factor, factor-to-variable, and domain-mask arrays,
    including the hard-coded factor / edge offsets for each factor family.
    """
    # Defines a label matrix: 5 candidates x 3 labeling functions
    L = sparse.lil_matrix((5, 3))
    # The first LF always says yes
    L[0, 0] = 1
    L[1, 0] = 1
    L[2, 0] = 1
    L[3, 0] = 1
    L[4, 0] = 1
    # The second LF votes differently
    L[0, 1] = 1
    L[2, 1] = -1
    L[4, 1] = 1
    # The third LF always abstains
    # Defined dependencies: 5 pairwise/triple deps over the 3 LFs
    deps = []
    deps.append((0, 1, DEP_SIMILAR))
    deps.append((0, 2, DEP_SIMILAR))
    deps.append((0, 1, DEP_FIXING))
    deps.append((0, 2, DEP_REINFORCING))
    deps.append((1, 2, DEP_EXCLUSIVE))
    # Tests compilation
    gen_model = GenerativeModel(class_prior=False, lf_prior=False,
                                lf_propensity=True,
                                lf_class_propensity=False)
    gen_model._process_dependency_graph(L, deps)
    m, n = L.shape
    # One (fixed) accuracy prior weight per LF, none pinned as fixed LFs.
    LF_acc_prior_weights = [1.0 for _ in range(n)]
    is_fixed = [False for _ in range(n)]
    gen_model.cardinality = 2
    cardinalities = 2 * np.ones(5)
    weight, variable, factor, ftv, domain_mask, n_edges =\
        gen_model._compile(L, 0.5, -1.0, LF_acc_prior_weights, is_fixed,
                           cardinalities)
    #
    # Weights
    #
    # Should now be 3 for LFs + 3 fixed for LF priors + 3 for LF propensity
    # + 5 for deps
    self.assertEqual(len(weight), 14)
    # The LF priors (even indices 0,2,4): fixed at prior weight 1.0
    for i in range(0,6,2):
        self.assertTrue(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 1.0)
    # The LF weights (odd indices 1,3,5): learnable, initialized at 0.0
    for i in range(1,6,2):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.0)
    # The dep weights (propensity + dependency factors): learnable, init
    # value 0.5 (the second positional argument passed to _compile)
    for i in range(6, 14):
        self.assertFalse(weight[i]['isFixed'])
        self.assertEqual(weight[i]['initialValue'], 0.5)
    #
    # Variables
    #
    # 5 latent candidate variables + 5 * 3 observed LF-label variables
    self.assertEqual(len(variable), 20)
    for i in range(5):
        # Latent (query) variables: binary, non-evidence
        self.assertEqual(variable[i]['isEvidence'], 0)
        self.assertTrue(variable[i]['initialValue'] == 0 or
                        variable[i]['initialValue'] == 1)
        self.assertEqual(variable[i]["dataType"], 0)
        self.assertEqual(variable[i]["cardinality"], 2)
    for i in range(5):
        for j in range(3):
            # Observed LF votes are evidence variables
            self.assertEqual(variable[5 + i * 3 + j]['isEvidence'], 1)
            # Remap label value; abstain is 0 in L, cardinality (= 2) in NS
            if L[i, j] == -1:
                l = 0
            elif L[i, j] == 0:
                l = 2
            elif L[i,j] == 1:
                l = 1
            self.assertEqual(variable[5 + i * 3 + j]['initialValue'], l)
            self.assertEqual(variable[5 + i * 3 + j]["dataType"], 0)
            # Ternary: {-1, abstain, +1} mapped to {0, 2, 1}
            self.assertEqual(variable[5 + i * 3 + j]["cardinality"], 3)
    #
    # Factors
    #
    # 30 accuracy + 15 propensity + 5 per declared dependency (5 deps)
    self.assertEqual(len(factor), 70)
    # Accuracy factors: per candidate, 6 = (LF weight + LF prior) x 3 LFs
    f_offset = 0
    ftv_offset = 0
    for i in range(5):
        for j in range(6):
            self.assertEqual(factor[f_offset + i * 6+ j]["factorFunction"],
                             FACTORS["DP_GEN_LF_ACCURACY"])
            self.assertEqual(factor[f_offset + i * 6 + j]["weightId"], j)
            self.assertEqual(factor[f_offset + i * 6 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 6 + j]["arity"], 2)
            self.assertEqual(factor[f_offset + i * 6 + j]["ftv_offset"],
                             ftv_offset + 2 * (i * 6 + j))
    # Propensity factors: one unary factor per (candidate, LF) pair
    f_offset = 30
    ftv_offset = 60
    for i in range(5):
        for j in range(3):
            self.assertEqual(
                factor[f_offset + i * 3 + j]["factorFunction"],
                FACTORS["DP_GEN_LF_PROPENSITY"])
            self.assertEqual(factor[f_offset + i * 3 + j]["weightId"],
                             6 + j)
            self.assertEqual(factor[f_offset + i * 3 + j]["featureValue"],
                             1)
            self.assertEqual(factor[f_offset + i * 3 + j]["arity"], 1)
            self.assertEqual(factor[f_offset + i * 3 + j]["ftv_offset"],
                             ftv_offset + (i * 3 + j))
    # SIMILAR dep (LFs 0,1): binary factors, weight 9
    f_offset = 45
    ftv_offset = 75
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_SIMILAR"])
        self.assertEqual(factor[f_offset + i]["weightId"], 9)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # SIMILAR dep (LFs 0,2): binary factors, weight 10
    f_offset = 50
    ftv_offset = 85
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_SIMILAR"])
        self.assertEqual(factor[f_offset + i]["weightId"], 10)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    # FIXING dep (LFs 0,1): ternary factors (candidate + both LFs)
    f_offset = 55
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_FIXING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 11)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # REINFORCING dep (LFs 0,2): ternary factors
    f_offset = 60
    ftv_offset = 110
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_REINFORCING"])
        self.assertEqual(factor[f_offset + i]["weightId"], 12)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 3)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 3 * i)
    # EXCLUSIVE dep (LFs 1,2): binary factors over the two LF labels
    f_offset = 65
    ftv_offset = 125
    for i in range(5):
        self.assertEqual(factor[f_offset + i]["factorFunction"],
                         FACTORS["DP_GEN_DEP_EXCLUSIVE"])
        self.assertEqual(factor[f_offset + i]["weightId"], 13)
        self.assertEqual(factor[f_offset + i]["featureValue"], 1)
        self.assertEqual(factor[f_offset + i]["arity"], 2)
        self.assertEqual(factor[f_offset + i]["ftv_offset"],
                         ftv_offset + 2 * i)
    #
    # Factor to Var
    #
    # 60 accuracy + 15 propensity + 2*5 + 2*5 + 3*5 + 3*5 + 2*5 dep edges
    self.assertEqual(len(ftv), 135)
    # Accuracy factor edges: (candidate, LF-label) pairs, one pair per
    # weight factor and one per prior factor
    ftv_offset = 0
    for i in range(5):
        for j in range(3):
            for k in range(2):
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k]["vid"], i)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) +
                        2 * k]["dense_equal_to"], 0)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k + 1]["vid"],
                    5 + i * 3 + j)
                self.assertEqual(
                    ftv[ftv_offset + 4 * (i * 3 + j) + 2 * k +
                        1]["dense_equal_to"], 0)
    # Propensity edges: one per (candidate, LF) observed label
    ftv_offset = 60
    for i in range(5):
        for j in range(3):
            self.assertEqual(ftv[ftv_offset + (i * 3 + j)]["vid"],
                             5 + i * 3 + j)
            self.assertEqual(
                ftv[ftv_offset + (i * 3 + j)]["dense_equal_to"], 0)
    # SIMILAR (0,1) edges: LF 0 label, LF 1 label
    ftv_offset = 75
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # SIMILAR (0,2) edges: LF 0 label, LF 2 label
    ftv_offset = 85
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    # FIXING (0,1) edges: candidate, LF 0 label, LF 1 label
    ftv_offset = 95
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # REINFORCING (0,2) edges: candidate, LF 0 label, LF 2 label
    ftv_offset = 110
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 3 * i]["vid"], i)
        self.assertEqual(ftv[ftv_offset + 3 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["vid"], 5 + i * 3)
        self.assertEqual(ftv[ftv_offset + 3 * i + 1]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 3 * i + 2]["dense_equal_to"], 0)
    # EXCLUSIVE (1,2) edges: LF 1 label, LF 2 label
    ftv_offset = 125
    for i in range(5):
        self.assertEqual(ftv[ftv_offset + 2 * i]["vid"], 5 + i * 3 + 1)
        self.assertEqual(ftv[ftv_offset + 2 * i]["dense_equal_to"], 0)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["vid"],
                         5 + i * 3 + 2)
        self.assertEqual(ftv[ftv_offset + 2 * i + 1]["dense_equal_to"], 0)
    #
    # Domain mask
    #
    # No candidate_ranges were supplied, so no variable is masked.
    self.assertEqual(len(domain_mask), 20)
    for i in range(20):
        self.assertFalse(domain_mask[i])
    # n_edges equals the total number of factor-to-variable entries
    self.assertEqual(n_edges, 135)