def create_eck(self, n):
    """
    Build the pairwise count table over self.data_combinations.

    @param n: single data item frequency; also indexed by the empty
        tuple C{()}
    @return: nested dict C{eck[k1][k2]} holding one value for every
        ordered pair of entries of self.data_combinations
    """
    def pair_count(first, second):
        # The empty combination is looked up in n directly rather than
        # through a product over its (non-existent) members.
        if first == () and second == ():
            return n[()] * (n[()] - 1)
        if first == ():
            return n[()] * Util.prod([n[x] for x in second])
        if second == ():
            return Util.prod([n[x] for x in first]) * n[()]
        # An item that also occurs in `first` contributes n[x] - 1 on the
        # second side; every other item contributes n[x].
        return Util.prod([n[x] for x in first]) * Util.prod(
            [(n[x] - 1) if x in first else n[x] for x in second])

    combos = self.data_combinations
    eck = dict((combo, {}) for combo in combos)
    for start, k1 in enumerate(combos):
        row = eck[k1]
        # Only the pairs from k1 onwards are computed ...
        for k2 in combos[start:]:
            row[k2] = pair_count(k1, k2)
        # ... and the cells below the diagonal are copied from them.
        for k3 in combos[start + 1:]:
            eck[k3][k1] = row[k3]
    return eck
def setUpClass(cls):
    """
    Prepare the shared fixtures: an ObservedDisagreement built from the
    first entry that Util.read_data returns for data/music.dat, and the
    table produced by its create_ock().
    """
    # The data directory sits two levels above this file's directory.
    here = os.path.dirname(__file__)
    root = os.path.dirname(os.path.dirname(here))
    music_file = os.path.join(root, "data", "music.dat")
    data = Util.read_data(music_file)[0]
    cls.od = ObservedDisagreement(data)
    cls.ock = cls.od.create_ock()
def test_read_data(self):
    """
    Check Util.read_data on self.filepath and run the helpers that
    post-process its result (flat_case_data, get_data_item,
    get_data_item_combination).
    """
    banner = "*" * 6
    # print("") instead of a bare print: it emits the same blank line
    # under both the Python 2 print statement and the Python 3 function.
    print("")
    print("{0}testing read data{0}".format(banner))
    data = Util.read_data(self.filepath)[0]
    print("data = {0}".format(data))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        data,
        [[("Funk", "Rock"), ("Rock",)],
         [("Funk", "Jazz", "Rock"), ("Funk",), ("Funk", "Rock")],
         [(), ("Jazz",), ("Jazz",)]],
        "data should be read as expected")
    flat_data = Util.flat_case_data(data)
    print("flat_data = {0}".format(flat_data))
    self.assertEqual(
        flat_data,
        [('Funk', 'Rock'), ('Rock',), ('Funk', 'Jazz', 'Rock'),
         ('Funk',), (), ('Jazz',)])
    data_items = Util.get_data_item(flat_data)
    print("data_items={0}".format(data_items))
    self.assertTrue(() in data_items,
                    "empty set should be included in unique single items")
    # The combinations are only printed; nothing is asserted about them.
    data_combination = Util.get_data_item_combination(data_items)
    print("data_combination = {0}".format(data_combination))
    print("{0}tested read data{0}".format(banner))
    print("")
def __init__(self, data):
    '''
    Store the observations and their flattened form.

    @param data: a list of lists. Each sublist stands for the observations
        for a particular object; each observation is a tuple of strings
        and may be the empty tuple.
    @raise AssertionError: when the first case, its first observation or
        that observation's first value has an unexpected type
    '''
    # Same spot checks as before, but each level is only inspected when
    # the level above it is non-empty: an empty tuple is a legal
    # observation and used to crash the check with IndexError.
    assert isinstance(data, list), "data must be a list of lists"
    if data:
        first_case = data[0]
        assert isinstance(first_case, list), \
            "each case must be a list of observations"
        if first_case:
            first_observation = first_case[0]
            assert isinstance(first_observation, tuple), \
                "each observation must be a tuple"
            if first_observation:
                assert isinstance(first_observation[0], str), \
                    "observation values must be strings"
    self.data = data
    self.flat_data = Util.flat_case_data(data)
def test_prod(self):
    """
    Check Util.prod on an empty list, a single element and several
    elements.
    """
    banner = "*" * 6
    # print("") yields a blank line under both Python 2 and Python 3.
    print("")
    print("{0}testing multiplication of list of numbers{0}".format(banner))
    d = []
    m = Util.prod(d)
    print("prod({{}}) = {0}".format(m))
    self.assertEqual(m, 1, "prod({}) = 1")
    d = [2]
    m = Util.prod(d)
    print("prod({{2}}) = {0}".format(m))
    self.assertEqual(m, 2, "prod({2}) = 2")
    d = [1, 2, 3, 4]
    m = Util.prod(d)
    print("prod({{1,2,3,4}}) = {0}".format(m))
    # The message is not passed through str.format, so single braces are
    # required here; it also names prod like the other messages do.
    self.assertEqual(m, 24, "prod({1,2,3,4}) = 24")
    print("{0}tested multiplication of list of numbers{0}".format(banner))
    print("")
def test_prod(self):
    """
    Exercise Util.prod with zero, one and four factors.
    """
    stars = "*" * 6
    # (input list, label shown in output, expected product)
    cases = [
        ([], "{}", 1),
        ([2], "{2}", 2),
        ([1, 2, 3, 4], "{1,2,3,4}", 24),
    ]
    # print("") yields a blank line under both Python 2 and Python 3.
    print("")
    print("{0}testing multiplication of list of numbers{0}".format(stars))
    for d, label, expected in cases:
        m = Util.prod(d)
        print("prod({0}) = {1}".format(label, m))
        # Uniform failure message; the last case used to read
        # "m({{1,2,3,4}}) = 24" with unformatted doubled braces.
        self.assertEqual(m, expected,
                         "prod({0}) = {1}".format(label, expected))
    print("{0}tested multiplication of list of numbers{0}".format(stars))
    print("")
def get_single_case_ed(self, item1, item2, p, n, d):
    """
    Compute the expected-disagreement term for one ordered pair of
    multi-value items.

    @param item1: multi-value item1
    @param item2: multi-value item2
    @param p: output of get_num_p, indexed here by item length
    @param n: output of get_single_item_n, indexed here by single item
        and by the empty tuple
    @param d: output of get_ways_of_pair, indexed here as
        d[len(item1)][len(item2)]
    @return: expected disagreement of item1 and item2, or 0 when the
        length of either item is not a key of p (original author's note:
        this is not symmetric, i.e.,
        get_single_case_ed(item1, item2) != get_single_case_ed(item2, item1))
    """
    size1 = len(item1)
    size2 = len(item2)
    # `in` replaces dict.has_key, which only exists on Python 2.
    if size1 not in p or size2 not in p:
        return 0
    ratio1 = p[size1]
    ratio2 = p[size2]

    # A zero entry would make the final division fail; 1 is used instead.
    denominator = d[size1][size2]
    if denominator == 0:
        denominator = 1

    if item1 == () and item2 == ():
        numerator = n[()] * (n[()] - 1)
    elif item1 == ():
        numerator = n[()] * Util.prod([n[x] for x in item2])
    elif item2 == ():
        numerator = Util.prod([n[x] for x in item1]) * n[()]
    else:
        # Members of item2 that are keys of n but not part of item1.
        only_in_item2 = [x for x in item2 if x in n and x not in item1]
        # Members of item1 that also occur in item2.
        shared = [x for x in item1 if x in item2]
        numerator = (Util.prod([n[x] for x in item1])
                     * Util.prod([n[x] for x in only_in_item2])
                     * Util.prod([(n[x] - 1) for x in shared]))

    delta = Util.distance(item1, item2)
    return ratio1 * ratio2 * numerator * delta / denominator
def setUpClass(cls):
    """
    Read data/comments.dat through Util.read_data into cls.data and print
    every entry with its index.
    """
    # The data directory sits two levels above this file's directory.
    base_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data")
    cls.data = Util.read_data(os.path.join(base_path, "comments.dat"))
    banner = "{0}data{0}".format("*" * 6)
    # print("") rather than a bare print: same blank line on Python 2
    # and Python 3, where the bare statement form is a syntax error.
    print("")
    print(banner)
    for c, d in enumerate(cls.data):
        print("data{0}:{1}".format(c, d))
    print(banner)
    print("")
def get_single_case_ed(self, item1, item2, p, n, d):
    """
    Expected disagreement contributed by the ordered pair (item1, item2).

    @param item1: multi-value item1
    @param item2: multi-value item2
    @param p: output of get_num_p; looked up by len(item1) and len(item2)
    @param n: output of get_single_item_n; looked up by single item and
        by the empty tuple
    @param d: output of get_ways_of_pair; looked up as
        d[len(item1)][len(item2)]
    @return: expected disagreement of item1 and item2; 0 if p has no entry
        for one of the two lengths (original author's note: this is not
        symmetric, i.e.,
        get_single_case_ed(item1, item2) != get_single_case_ed(item2, item1))
    """
    len1, len2 = len(item1), len(item2)

    # Membership tests replace the Python-2-only dict.has_key().
    if len1 not in p:
        return 0
    if len2 not in p:
        return 0

    ways = d[len1][len2]
    # Fall back to 1 so the division below cannot hit a zero divisor.
    denominator = 1 if ways == 0 else ways

    empty = ()
    if item1 == empty and item2 == empty:
        numerator = n[empty] * (n[empty] - 1)
    elif item1 == empty:
        numerator = n[empty] * Util.prod([n[x] for x in item2])
    elif item2 == empty:
        numerator = Util.prod([n[x] for x in item1]) * n[empty]
    else:
        # Factors: every member of item1, then the members of item2 that
        # are keys of n and absent from item1, then one less than the
        # count for each member common to both items.
        own = [n[x] for x in item1]
        extra = [n[x] for x in item2 if x in n and x not in item1]
        common = [n[x] - 1 for x in item1 if x in item2]
        numerator = Util.prod(own) * Util.prod(extra) * Util.prod(common)

    delta = Util.distance(item1, item2)
    return p[len1] * p[len2] * numerator * delta / denominator
def __init__(self, data):
    '''
    Keep the raw observations and their flattened form.

    @param data: a list of lists. Each sublist stands for the observations
        for a particular object; an observation is a tuple of strings and
        may be empty.
    @raise AssertionError: when data, its first case, that case's first
        observation or that observation's first value has the wrong type
    '''
    # Walk down the first element of each level, stopping at the first
    # empty container. Indexing unconditionally, as before, raised
    # IndexError for an empty case list or an empty-tuple observation.
    expected_types = (list, list, tuple, str)
    probe = data
    for depth, expected in enumerate(expected_types):
        assert isinstance(probe, expected), \
            "unexpected type at nesting depth {0}".format(depth)
        if depth == len(expected_types) - 1 or not probe:
            break
        probe = probe[0]
    self.data = data
    self.flat_data = Util.flat_case_data(data)
def get_disagreement(self, ock):
    """
    Return the distance-weighted sum of the table divided by its plain sum.

    @param ock: nested dict indexed as ock[f1][f2] for every pair of
        entries of self.flat_data
    @return: sum of ock[f1][f2] * Util.distance(f1, f2) divided by the
        sum of ock[f1][f2], both taken over all ordered pairs
    """
    observations = self.flat_data

    # Totals are accumulated row by row.
    ndotdot = 0
    for f1 in observations:
        row = ock[f1]
        ndotdot += sum([row[f2] for f2 in observations])

    distanced_ock_sum = 0
    for f1 in observations:
        row = ock[f1]
        distanced_ock_sum += sum(
            [row[f2] * Util.distance(f1, f2) for f2 in observations])

    return distanced_ock_sum / ndotdot
def test_read_data(self):
    """
    Verify what Util.read_data returns for self.filepath, then feed the
    result through flat_case_data, get_data_item and
    get_data_item_combination.
    """
    expected_data = [
        [("Funk", "Rock"), ("Rock", )],
        [("Funk", "Jazz", "Rock"), ("Funk", ), ("Funk", "Rock")],
        [(), ("Jazz", ), ("Jazz", )],
    ]
    expected_flat = [('Funk', 'Rock'), ('Rock', ), ('Funk', 'Jazz', 'Rock'),
                     ('Funk', ), (), ('Jazz', )]
    stars = "*" * 6

    # print("") produces one blank line on Python 2 and on Python 3.
    print("")
    print("{0}testing read data{0}".format(stars))

    data = Util.read_data(self.filepath)[0]
    print("data = {0}".format(data))
    # assertEqual is the supported spelling; assertEquals is a
    # deprecated alias.
    self.assertEqual(data, expected_data, "data should be read as expected")

    flat_data = Util.flat_case_data(data)
    print("flat_data = {0}".format(flat_data))
    self.assertEqual(flat_data, expected_flat)

    data_items = Util.get_data_item(flat_data)
    print("data_items={0}".format(data_items))
    self.assertTrue(() in data_items,
                    "empty set should be included in unique single items")

    # Printed for inspection only; no assertion is made on it.
    data_combination = Util.get_data_item_combination(data_items)
    print("data_combination = {0}".format(data_combination))

    print("{0}tested read data{0}".format(stars))
    print("")
def test_distance(self):
    """
    Check Util.distance for two empty items, an empty against a non-empty
    item, two overlapping items, and argument order.
    """
    banner = "*" * 6
    # print("") yields a blank line under both Python 2 and Python 3.
    print("")
    print("{0}testing distance{0}".format(banner))
    d1 = ()
    print("distance({{}}, {{}}) = {0}".format(Util.distance(d1, d1)))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(Util.distance(d1, d1), 0,
                     "distance({Empty}, {Empty}) = 0")
    d2 = ("Funk", "Rock")
    # The label now names the operands actually passed (empty item and
    # {Funk, Rock}); it used to read "distance({Funk}, {Rock})".
    print("distance({{}}, {{Funk, Rock}}) = {0}".format(
        Util.distance(d1, d2)))
    self.assertEqual(Util.distance(d1, d2), 1.0,
                     "distance({Empty}, {Rock, Funk}) = 1")
    d3 = ("Funk", "Rock")
    d4 = ("Funk", "Jazz", "Rock")
    print("distance({{Funk, Rock}}, {{Funk, Rock, Jazz}}) = {0}".format(
        Util.distance(d3, d4)))
    self.assertAlmostEqual(
        Util.distance(d3, d4), 0.2, 5,
        "distance({Funk, Rock}, {Funk, Rock, Jazz}) = 0.2")
    self.assertEqual(Util.distance(d3, d4), Util.distance(d4, d3),
                     "distance is symmetric")
    print("{0}tested distance{0}".format(banner))
    print("")
def create_eck(self, n):
    """
    Fill a nested dict with one count per ordered pair of combinations.

    @param n: single data item frequency, with the empty tuple as an
        additional key
    @return: dict of dicts indexed as eck[k1][k2], where k1 and k2 range
        over self.data_combinations
    """
    empty = ()
    eck = {}
    for combo in self.data_combinations:
        eck[combo] = {}

    # Combinations that still need a value against the current k1.
    pending = list(self.data_combinations)
    for k1 in self.data_combinations:
        for k2 in pending:
            if k1 != empty and k2 != empty:
                left = Util.prod([n[x] for x in k1])
                # Items already counted on the k1 side contribute one less.
                right = Util.prod(
                    [(n[x] - 1) if x in k1 else n[x] for x in k2])
                value = left * right
            elif k1 == empty and k2 == empty:
                value = n[empty] * (n[empty] - 1)
            elif k1 == empty:
                value = n[empty] * Util.prod([n[x] for x in k2])
            else:
                value = Util.prod([n[x] for x in k1]) * n[empty]
            eck[k1][k2] = value
        # k1 is the head of `pending` at this point, so dropping the head
        # removes exactly k1.
        del pending[0]
        # Mirror the freshly computed row into the matching column.
        for k3 in pending:
            eck[k3][k1] = eck[k1][k3]
    return eck
def create_ock(self):
    """
    Build the pairwise coincidence table over self.flat_data.

    @return: nested dict where ock[obs][obs2] is the sum of
        Util.coincidence(d, obs, obs2) over every case d in self.data
        that contains both obs and obs2
    """
    observations = self.flat_data
    # Built directly instead of fromkeys() followed by a loop over
    # iterkeys(), which does not exist on Python 3.
    ock = dict((obs, {}) for obs in observations)
    for start, obs in enumerate(observations):
        # Compute the pairs from obs onwards; slicing replaces the
        # repeated list.remove() on a working copy.
        for obs2 in observations[start:]:
            ock[obs][obs2] = sum([
                Util.coincidence(d, obs, obs2) for d in self.data
                if obs in d and obs2 in d
            ])
        # copy the symmetric items
        for obs2 in observations[start + 1:]:
            ock[obs2][obs] = ock[obs][obs2]
    return ock
def create_ock(self):
    """
    Compute a coincidence value for every ordered pair of observations in
    self.flat_data.

    @return: dict of dicts; ock[obs][obs2] sums
        Util.coincidence(d, obs, obs2) over the cases d of self.data in
        which both obs and obs2 occur
    """
    ock = {}
    # Plain iteration over the list replaces ock.iterkeys(), which is not
    # available on Python 3.
    for key in self.flat_data:
        ock[key] = {}

    # Observations not yet paired with the current one.
    pending = list(self.flat_data)
    for obs in self.flat_data:
        for obs2 in pending:
            shared_cases = [d for d in self.data if obs in d and obs2 in d]
            ock[obs][obs2] = sum(
                [Util.coincidence(d, obs, obs2) for d in shared_cases])
        # obs is the head of `pending` here; dropping the head avoids the
        # linear search that list.remove() performs.
        del pending[0]
        for obs2 in pending:
            # copy the symmetric items
            ock[obs2][obs] = ock[obs][obs2]
    return ock
def main():
    """
    Command-line entry point: read the data file named by the single
    command-line argument (resolved against base_path) and print an alpha
    value for every entry that Util.read_data returns.

    Exits with status -1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        # The usage line used to print a literal, unfilled "%s".
        print("python Main %s" % "<data_file>")
        sys.exit(-1)
    data = Util.read_data(os.path.join(base_path, sys.argv[1]))
    for c, d in enumerate(data):
        od = ObservedDisagreement(d)
        ock = od.create_ock()
        od_value = od.get_disagreement(ock)
        ed = ExpectedDisagreement(od, ock)
        ed_value = ed.get()
        # alpha = 1 - observed / expected disagreement
        alpha = 1 - od_value / ed_value
        # print(...) with one argument behaves identically on Python 2
        # and Python 3.
        print("data[{0}] = {1}".format(c, d))
        print("alpha = {0}".format(alpha))
def main():
    """
    Script driver. Expects exactly one command-line argument, the name of
    a data file below base_path, and prints each entry read from it
    together with its alpha value.

    Terminates via sys.exit(-1) when the argument is missing or extra
    arguments are supplied.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        # Spell out the expected argument; the previous text ended in a
        # "%s" placeholder that was never substituted.
        print("python Main %s" % "<data_file>")
        sys.exit(-1)

    data_file = os.path.join(base_path, arguments[0])
    for c, d in enumerate(Util.read_data(data_file)):
        od = ObservedDisagreement(d)
        ock = od.create_ock()
        od_value = od.get_disagreement(ock)
        ed_value = ExpectedDisagreement(od, ock).get()
        # alpha = 1 - observed / expected disagreement
        alpha = 1 - od_value / ed_value
        # Single-argument print(...) gives the same output on Python 2
        # and Python 3.
        print("data[{0}] = {1}".format(c, d))
        print("alpha = {0}".format(alpha))
def test_coincidence(self):
    """
    Check Util.list_item_count and Util.coincidence on two small cases,
    including that swapping the two items gives the same coincidence.
    """
    banner = "*" * 6
    # print("") yields a blank line under both Python 2 and Python 3.
    print("")
    print("{0}testing coincidence{0}".format(banner))
    data = [(), ('Jazz',), ('Jazz',)]
    # assertEqual replaces the deprecated assertEquals alias; the
    # failure message typo ("occures for twice") is corrected too.
    self.assertEqual(Util.list_item_count(data, ('Jazz', )), 2,
                     "Jazz occurs twice")
    self.assertEqual(Util.coincidence(data, (), ('Jazz',)), 1.0,
                     "coincidence of {} and (Jazz,) is 1")
    self.assertEqual(Util.coincidence(data, ('Jazz', ), ('Jazz',)), 1.0,
                     "coincidence of (Jazz,) and (Jazz,) is 1")
    data2 = [('Funk', 'Rock', 'Jazz'), ('Funk',), ('Funk', 'Rock')]
    self.assertEqual(Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock')),
                     0.5,
                     'coincidence of (Funk,) and (Funk, Rock) is 0.5')
    self.assertEqual(Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock')),
                     Util.coincidence(data2, ('Funk', 'Rock'), ('Funk',)),
                     'coincidence is symmetric')
    print("{0}tested coincidence{0}".format(banner))
    print("")
def test_distance(self):
    """
    Verify Util.distance: zero for two empty items, 1.0 between an empty
    and a non-empty item, about 0.2 for {Funk, Rock} against
    {Funk, Jazz, Rock}, and the same value in either argument order.
    """
    stars = "*" * 6
    empty = ()
    funk_rock = ("Funk", "Rock")
    funk_jazz_rock = ("Funk", "Jazz", "Rock")

    # print("") produces one blank line on Python 2 and on Python 3.
    print("")
    print("{0}testing distance{0}".format(stars))

    print("distance({{}}, {{}}) = {0}".format(Util.distance(empty, empty)))
    # assertEqual is the supported spelling of the deprecated assertEquals.
    self.assertEqual(Util.distance(empty, empty), 0,
                     "distance({Empty}, {Empty}) = 0")

    # Label corrected: the operands are the empty item and {Funk, Rock},
    # not {Funk} and {Rock} as the old output claimed.
    print("distance({{}}, {{Funk, Rock}}) = {0}".format(
        Util.distance(empty, funk_rock)))
    self.assertEqual(Util.distance(empty, funk_rock), 1.0,
                     "distance({Empty}, {Rock, Funk}) = 1")

    print("distance({{Funk, Rock}}, {{Funk, Rock, Jazz}}) = {0}".format(
        Util.distance(funk_rock, funk_jazz_rock)))
    self.assertAlmostEqual(
        Util.distance(funk_rock, funk_jazz_rock), 0.2, 5,
        "distance({Funk, Rock}, {Funk, Rock, Jazz}) = 0.2")
    self.assertEqual(Util.distance(funk_rock, funk_jazz_rock),
                     Util.distance(funk_jazz_rock, funk_rock),
                     "distance is symmetric")

    print("{0}tested distance{0}".format(stars))
    print("")
def test_coincidence(self):
    """
    Verify Util.list_item_count and Util.coincidence against hand-checked
    values, and that coincidence does not depend on item order.
    """
    stars = "*" * 6
    jazz = ('Jazz', )
    funk = ('Funk', )
    funk_rock = ('Funk', 'Rock')

    # print("") produces one blank line on Python 2 and on Python 3.
    print("")
    print("{0}testing coincidence{0}".format(stars))

    data = [(), jazz, jazz]
    # assertEqual is the supported spelling of the deprecated
    # assertEquals; the message previously read "occures for twice".
    self.assertEqual(Util.list_item_count(data, jazz), 2,
                     "Jazz occurs twice")
    self.assertEqual(Util.coincidence(data, (), jazz), 1.0,
                     "coincidence of {} and (Jazz,) is 1")
    self.assertEqual(Util.coincidence(data, jazz, jazz), 1.0,
                     "coincidence of (Jazz,) and (Jazz,) is 1")

    data2 = [('Funk', 'Rock', 'Jazz'), funk, funk_rock]
    forward = Util.coincidence(data2, funk, funk_rock)
    self.assertEqual(forward, 0.5,
                     'coincidence of (Funk,) and (Funk, Rock) is 0.5')
    self.assertEqual(Util.coincidence(data2, funk, funk_rock),
                     Util.coincidence(data2, funk_rock, funk),
                     'coincidence is symmetric')

    print("{0}tested coincidence{0}".format(stars))
    print("")
flat_data_2 = self.flat_data[:] for obs in self.flat_data: for obs2 in flat_data_2: ock[obs][obs2] = sum([ Util.coincidence(d, obs, obs2) for d in self.data if obs in d and obs2 in d ]) flat_data_2.remove(obs) for obs2 in flat_data_2: # copy the symmetric items ock[obs2][obs] = ock[obs][obs2] return ock def get_disagreement(self, ock): ndotdot = sum([sum([ock[f1][f2] for f2 in self.flat_data]) for f1 in self.flat_data]) distanced_ock_sum = sum([sum([ock[f1][f2] * Util.distance(f1, f2) for f2 in self.flat_data]) for f1 in self.flat_data]) return distanced_ock_sum / ndotdot def get(self): ock = self.create_ock() return self.get_disagreement(ock) if __name__ == "__main__": import os data = Util.read_data(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "music.dat"))[0] od = ObservedDisagreement(data)
def __init__(self, observed_disagreement, ock):
    """
    Store the inputs and derive the item tables from them.

    @param observed_disagreement: object whose flat_data attribute is
        passed to Util.get_data_item; kept as self.od
    @param ock: kept unchanged as self.ock
    """
    self.od, self.ock = observed_disagreement, ock
    items = Util.get_data_item(self.od.flat_data)
    self.data_item = items
    # The combinations are derived from the items extracted just above.
    self.data_combinations = Util.get_data_item_combination(items)
def get_disagreement(self, ock): ndotdot = sum([sum([ock[f1][f2] for f2 in self.flat_data]) for f1 in self.flat_data]) distanced_ock_sum = sum([sum([ock[f1][f2] * Util.distance(f1, f2) for f2 in self.flat_data]) for f1 in self.flat_data]) return distanced_ock_sum / ndotdot
def __init__(self, observed_disagreement, ock):
    """
    Remember the observed-disagreement object and its table, then
    precompute the data items and their combinations.

    @param observed_disagreement: stored as self.od; its flat_data
        attribute is the input of Util.get_data_item
    @param ock: stored as self.ock without modification
    """
    self.od = observed_disagreement
    self.ock = ock
    flat_data = self.od.flat_data
    self.data_item = Util.get_data_item(flat_data)
    # Built from the data items computed on the previous line.
    combinations = Util.get_data_item_combination(self.data_item)
    self.data_combinations = combinations
def setUpClass(cls):
    """
    Build the shared ExpectedDisagreement fixture (cls.ed) from the first
    entry that Util.read_data returns for data/music.dat.
    """
    # Climb from this file to the directory two levels above its own
    # directory, where the data folder is expected.
    root = __file__
    for _ in range(3):
        root = os.path.dirname(root)
    entries = Util.read_data(os.path.join(root, "data", "music.dat"))
    od = ObservedDisagreement(entries[0])
    cls.ed = ExpectedDisagreement(od, od.create_ock())
for obs2 in flat_data_2: # copy the symmetric items ock[obs2][obs] = ock[obs][obs2] return ock def get_disagreement(self, ock): ndotdot = sum([ sum([ock[f1][f2] for f2 in self.flat_data]) for f1 in self.flat_data ]) distanced_ock_sum = sum([ sum([ock[f1][f2] * Util.distance(f1, f2) for f2 in self.flat_data]) for f1 in self.flat_data ]) return distanced_ock_sum / ndotdot def get(self): ock = self.create_ock() return self.get_disagreement(ock) if __name__ == "__main__": import os data = Util.read_data( os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "music.dat"))[0] od = ObservedDisagreement(data)