Esempio n. 1
0
    def create_eck(self, n):
        """
        Build a table with one cell for every ordered pair of entries in
        self.data_combinations.

        Only one orientation of each pair is computed; the mirrored cell is
        filled by copying, so the returned table is symmetric.

        @param n: get single data item frequency (indexed by single item,
                  and by the empty tuple () for the empty observation)
        @return: dict of dicts, eck[k1][k2]
        """
        empty = ()

        def pair_value(first, second):
            # The empty combination is handled separately from item tuples.
            if first == empty:
                if second == empty:
                    return n[empty] * (n[empty] - 1)
                return n[empty] * Util.prod([n[item] for item in second])
            if second == empty:
                return Util.prod([n[item] for item in first]) * n[empty]
            # Items of `second` that also occur in `first` count one less.
            return Util.prod([n[item] for item in first]) * Util.prod(
                [(n[item] - 1) if item in first else n[item]
                 for item in second])

        eck = dict((combo, {}) for combo in self.data_combinations)

        remaining = self.data_combinations[:]
        for row in self.data_combinations:
            cells = eck[row]
            for col in remaining:
                cells[col] = pair_value(row, col)
            # Rows already processed never need to be recomputed.
            remaining.remove(row)
            for col in remaining:
                eck[col][row] = cells[col]

        return eck
 def setUpClass(cls):
     """
     Read the first data set of data/music.dat (located three directory
     levels above this file) and store the ObservedDisagreement built from
     it, together with its coincidence table, on the class.
     """
     here = os.path.dirname(__file__)
     base_dir = os.path.dirname(os.path.dirname(here))
     music_file = os.path.join(base_dir, "data", "music.dat")
     observed = ObservedDisagreement(Util.read_data(music_file)[0])
     cls.od = observed
     cls.ock = observed.create_ock()
Esempio n. 3
0
 def test_read_data(self):
     """
     Read the fixture at self.filepath and check the parsed cases, the
     flattened observations and that the empty observation () is kept as a
     data item.
     """
     banner = "*" * 6
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing read data{0}".format(banner))

     data = Util.read_data(self.filepath)[0]
     print("data = {0}".format(data))
     expected_data = [
         [("Funk", "Rock"), ("Rock",)],
         [("Funk", "Jazz", "Rock"), ("Funk",), ("Funk", "Rock")],
         [(), ("Jazz",), ("Jazz",)],
     ]
     # assertEqual replaces the deprecated alias assertEquals.
     self.assertEqual(data, expected_data, "data should be read as expected")

     flat_data = Util.flat_case_data(data)
     print("flat_data = {0}".format(flat_data))
     expected_flat = [('Funk', 'Rock'), ('Rock',), ('Funk', 'Jazz', 'Rock'),
                      ('Funk',), (), ('Jazz',)]
     self.assertEqual(flat_data, expected_flat)

     data_items = Util.get_data_item(flat_data)
     print("data_items={0}".format(data_items))
     self.assertTrue(() in data_items,
                     "empty set should be included in unique single items")

     # The combinations are only printed here, not asserted on.
     data_combination = Util.get_data_item_combination(data_items)
     print("data_combination = {0}".format(data_combination))
     print("{0}tested read data{0}".format(banner))
     print("")
Esempio n. 4
0
 def __init__(self, data):
     '''
     Store the raw data and its flattened form.

     @param data: a list of lists. Each sublist stands for the observations for a particular object;
                  each observation is a tuple of str labels (possibly the empty tuple)
     '''
     # Spot-check the nesting on the first element of every level, as
     # before, but only descend into non-empty containers: an empty data
     # set, an empty first case or an empty first observation () used to
     # raise IndexError instead of being accepted.
     assert isinstance(data, list)
     if data:
         assert isinstance(data[0], list)
         if data[0]:
             assert isinstance(data[0][0], tuple)
             if data[0][0]:
                 assert isinstance(data[0][0][0], str)

     self.data = data
     self.flat_data = Util.flat_case_data(data)
Esempio n. 5
0
 def test_prod(self):
     """
     Check Util.prod on an empty list, a single element and several
     elements.
     """
     banner = "*" * 6
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing multiplication of list of numbers{0}".format(banner))

     d = []
     m = Util.prod(d)
     print("prod({{}}) = {0}".format(m))
     self.assertEqual(m, 1, "prod({}) = 1")

     d = [2]
     m = Util.prod(d)
     print("prod({{2}}) = {0}".format(m))
     self.assertEqual(m, 2, "prod({2}) = 2")

     d = [1, 2, 3, 4]
     m = Util.prod(d)
     print("prod({{1,2,3,4}}) = {0}".format(m))
     # The message is not passed through format(), so the braces must not
     # be doubled; it also named the function "m" instead of "prod".
     self.assertEqual(m, 24, "prod({1,2,3,4}) = 24")

     print("{0}tested multiplication of list of numbers{0}".format(banner))
     print("")
Esempio n. 6
0
 def test_prod(self):
     """
     Check Util.prod on an empty list, a single element and several
     elements.
     """
     stars = "*" * 6
     # (input, expected product, text shown between the braces)
     cases = [
         ([], 1, ""),
         ([2], 2, "2"),
         ([1, 2, 3, 4], 24, "1,2,3,4"),
     ]

     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing multiplication of list of numbers{0}".format(stars))
     for numbers, expected, label in cases:
         m = Util.prod(numbers)
         print("prod({{{0}}}) = {1}".format(label, m))
         # The last message used to read "m({{1,2,3,4}}) = 24": doubled
         # braces that were never formatted and the wrong function name.
         self.assertEqual(
             m, expected, "prod({{{0}}}) = {1}".format(label, expected))
     print("{0}tested multiplication of list of numbers{0}".format(stars))
     print("")
Esempio n. 7
0
 def get_single_case_ed(self, item1, item2, p, n, d):
     """
     Compute the expected-disagreement contribution of one ordered pair of
     multi-value items.

     @param item1: multi-value item1
     @param item2: multi-value item2
     @param p: output of get_num_p (indexed by item length)
     @param n: output of get_single_item_n (indexed by single item, and by ())
     @param d: output of get_ways_of_pair (indexed by the two item lengths)

     @return: expected disagreement of item1 and item2; 0 when either length
              has no entry in p (note: this is not symmetric, i.e.,
              get_single_case_ed(item1, item2) != get_single_case_ed(item2, item1))
     """
     # `in` replaces dict.has_key(), which only exists on Python 2.
     if len(item1) not in p:
         return 0
     ratio1 = p[len(item1)]

     if len(item2) not in p:
         return 0
     ratio2 = p[len(item2)]

     # A zero entry would divide by zero below; 1 is used instead.
     denominator = d[len(item1)][len(item2)]
     if denominator == 0:
         denominator = 1

     if item1 == () and item2 == ():
         numerator = n[()] * (n[()] - 1)
     elif item1 == ():
         numerator = n[()] * Util.prod([n[x] for x in item2])
     elif item2 == ():
         numerator = Util.prod([n[x] for x in item1]) * n[()]
     else:
         # Items of item2 that are known to n but absent from item1. This
         # is the same selection as intersecting item2 with the complement
         # of item1 over n's keys, without building that complement list.
         only_in_item2 = [x for x in item2 if x in n and x not in item1]
         # Items shared by both; each contributes one less than its count.
         shared = [x for x in item1 if x in item2]
         numerator = (Util.prod([n[x] for x in item1])
                      * Util.prod([n[x] for x in only_in_item2])
                      * Util.prod([(n[x] - 1) for x in shared]))

     delta = Util.distance(item1, item2)

     return ratio1 * ratio2 * numerator * delta / denominator
Esempio n. 8
0
 def setUpClass(cls):
     """
     Load every data set from data/comments.dat (located three directory
     levels above this file) into cls.data and print each one.
     """
     base_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data")
     cls.data = Util.read_data(os.path.join(base_path, "comments.dat"))

     banner = "{0}data{0}".format("*" * 6)
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print(banner)
     for c, d in enumerate(cls.data):
         print("data{0}:{1}".format(c, d))
     print(banner)
     print("")
Esempio n. 9
0
 def get_single_case_ed(self, item1, item2, p, n, d):
     """
     Compute the expected-disagreement contribution of one ordered pair of
     multi-value items.

     @param item1: multi-value item1
     @param item2: multi-value item2
     @param p: output of get_num_p (indexed by item length)
     @param n: output of get_single_item_n (indexed by single item, and by ())
     @param d: output of get_ways_of_pair (indexed by the two item lengths)

     @return: expected disagreement of item1 and item2; 0 when either length
              has no entry in p (note: this is not symmetric, i.e.,
              get_single_case_ed(item1, item2) != get_single_case_ed(item2, item1))
     """
     size1 = len(item1)
     # Membership tests replace dict.has_key(), which is Python 2-only.
     if size1 not in p:
         return 0
     size2 = len(item2)
     if size2 not in p:
         return 0
     ratio1, ratio2 = p[size1], p[size2]

     # Guard against dividing by a zero entry: fall back to 1.
     ways = d[size1][size2]
     denominator = 1 if ways == 0 else ways

     empty = ()
     if item1 == empty and item2 == empty:
         numerator = n[empty] * (n[empty] - 1)
     elif item1 == empty:
         numerator = n[empty] * Util.prod([n[x] for x in item2])
     elif item2 == empty:
         numerator = Util.prod([n[x] for x in item1]) * n[empty]
     else:
         # Three factors: all of item1; the part of item2 outside item1
         # (restricted to items n knows about); and the overlap, where each
         # count is reduced by one. Filtering item2 directly avoids
         # materialising the complement of item1 over every key of n.
         item1_counts = [n[x] for x in item1]
         outside_counts = [n[x] for x in item2 if x in n and x not in item1]
         overlap_counts = [(n[x] - 1) for x in item1 if x in item2]
         numerator = (Util.prod(item1_counts)
                      * Util.prod(outside_counts)
                      * Util.prod(overlap_counts))

     delta = Util.distance(item1, item2)

     return ratio1 * ratio2 * numerator * delta / denominator
Esempio n. 10
0
    def __init__(self, data):
        '''
        Store the raw data and its flattened form.

        @param data: a list of lists. Each sublist stands for the observations for a particular object;
                     each observation is a tuple of str labels (possibly the empty tuple)
        '''
        # Walk down the first element of each nesting level, checking its
        # type, and stop as soon as a level is empty. Indexing blindly into
        # data[0][0][0] raised IndexError for an empty data set, an empty
        # first case, or a first observation equal to ().
        expected_types = (list, list, tuple, str)
        node = data
        for depth, expected in enumerate(expected_types):
            assert isinstance(node, expected)
            if depth == len(expected_types) - 1 or not node:
                break
            node = node[0]

        self.data = data
        self.flat_data = Util.flat_case_data(data)
Esempio n. 11
0
    def get_disagreement(self, ock):
        """
        Divide the distance-weighted total of the table by its plain total.

        Both totals run over every ordered pair of entries in
        self.flat_data and are accumulated row by row.

        @param ock: dict of dicts indexed as ock[f1][f2]
        @return: weighted total / plain total
        """
        observations = self.flat_data

        row_totals = [
            sum(ock[a][b] for b in observations) for a in observations
        ]
        ndotdot = sum(row_totals)

        weighted_row_totals = [
            sum(ock[a][b] * Util.distance(a, b) for b in observations)
            for a in observations
        ]

        return sum(weighted_row_totals) / ndotdot
Esempio n. 12
0
 def test_read_data(self):
     """
     Read the fixture at self.filepath and check the parsed cases, the
     flattened observations and that the empty observation () is kept as a
     data item.
     """
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing read data{0}".format("*" * 6))
     data = Util.read_data(self.filepath)[0]
     print("data = {0}".format(data))
     # assertEqual replaces the deprecated alias assertEquals.
     self.assertEqual(data,
                      [[("Funk", "Rock"), ("Rock", )],
                       [("Funk", "Jazz", "Rock"), ("Funk", ),
                        ("Funk", "Rock")], [(), ("Jazz", ), ("Jazz", )]],
                      "data should be read as expected")
     flat_data = Util.flat_case_data(data)
     print("flat_data = {0}".format(flat_data))
     self.assertEqual(flat_data, [('Funk', 'Rock'), ('Rock', ),
                                  ('Funk', 'Jazz', 'Rock'), ('Funk', ), (),
                                  ('Jazz', )])
     data_items = Util.get_data_item(flat_data)
     print("data_items={0}".format(data_items))
     self.assertTrue(() in data_items,
                     "empty set should be included in unique single items")
     # The combinations are only printed here, not asserted on.
     data_combination = Util.get_data_item_combination(data_items)
     print("data_combination = {0}".format(data_combination))
     print("{0}tested read data{0}".format("*" * 6))
     print("")
Esempio n. 13
0
 def test_distance(self):
     """
     Check Util.distance for two empty observations, for an empty versus a
     non-empty observation, for a partial overlap, and for symmetry.
     """
     banner = "*" * 6
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing distance{0}".format(banner))

     d1 = ()
     print("distance({{}}, {{}}) = {0}".format(Util.distance(d1, d1)))
     self.assertEqual(Util.distance(d1, d1), 0, "distance({Empty}, {Empty}) = 0")

     d2 = ("Funk", "Rock")
     # The label used to read "distance({Funk}, {Rock})", which is not the
     # pair being measured (the empty observation against (Funk, Rock)).
     print("distance({{}}, {{Funk, Rock}}) = {0}".format(Util.distance(d1, d2)))
     self.assertEqual(Util.distance(d1, d2), 1.0, "distance({Empty}, {Rock, Funk}) = 1")

     d3 = ("Funk", "Rock")
     d4 = ("Funk", "Jazz", "Rock")
     print("distance({{Funk, Rock}}, {{Funk, Rock, Jazz}}) = {0}".format(Util.distance(d3, d4)))
     self.assertAlmostEqual(Util.distance(d3, d4), 0.2, 5, "distance({Funk, Rock}, {Funk, Rock, Jazz}) = 0.2")
     self.assertEqual(Util.distance(d3, d4), Util.distance(d4, d3), "distance is symmetric")

     print("{0}tested distance{0}".format(banner))
     print("")
Esempio n. 14
0
 def create_eck(self, n):
     """
     Fill a dict-of-dicts table with one value per ordered pair of entries
     of self.data_combinations. Each unordered pair is computed once and
     mirrored into the opposite cell.

     @param n: get single data item frequency (indexed by single item,
               and by the empty tuple () for the empty observation)
     @return: the table, indexed as eck[k1][k2]
     """
     count_product = lambda combo: Util.prod([n[x] for x in combo])

     table = dict((combo, {}) for combo in self.data_combinations)
     todo = self.data_combinations[:]

     for left in self.data_combinations:
         for right in todo:
             left_is_empty = left == ()
             right_is_empty = right == ()
             if left_is_empty and right_is_empty:
                 value = n[()] * (n[()] - 1)
             elif left_is_empty:
                 value = n[()] * count_product(right)
             elif right_is_empty:
                 value = count_product(left) * n[()]
             else:
                 # Items repeated from `left` contribute one less.
                 value = count_product(left) * Util.prod(
                     [(n[x] - 1) if x in left else n[x] for x in right])
             table[left][right] = value

         # `left` is finished; mirror its row into the remaining columns.
         todo.remove(left)
         for other in todo:
             table[other][left] = table[left][other]

     return table
Esempio n. 15
0
 def create_ock(self):
     """
     Build the coincidence table over every pair of entries of
     self.flat_data.

     Cell ock[a][b] is the sum of Util.coincidence(d, a, b) over the cases d
     of self.data that contain both a and b. Each unordered pair is computed
     once and copied into the mirrored cell.

     @return: dict of dicts, indexed as ock[obs][obs2]
     """
     # One inner dict per observation, built directly. The previous
     # fromkeys() + iterkeys() loop did the same in two passes and
     # iterkeys() only exists on Python 2.
     ock = dict((obs, {}) for obs in self.flat_data)

     flat_data_2 = self.flat_data[:]
     for obs in self.flat_data:
         for obs2 in flat_data_2:
             ock[obs][obs2] = sum([
                 Util.coincidence(d, obs, obs2) for d in self.data
                 if obs in d and obs2 in d
             ])
         # obs has been paired with everything left; drop it so the
         # remaining rows only compute pairs not seen yet.
         flat_data_2.remove(obs)
         for obs2 in flat_data_2:
             # copy the symmetric items
             ock[obs2][obs] = ock[obs][obs2]
     return ock
Esempio n. 16
0
    def create_ock(self):
        """
        Build the coincidence table over every pair of entries of
        self.flat_data.

        Cell ock[a][b] is the sum of Util.coincidence(d, a, b) over the cases
        d of self.data that contain both a and b. Each unordered pair is
        computed once and copied into the mirrored cell.

        @return: dict of dicts, indexed as ock[obs][obs2]
        """
        # Iterating the dict directly replaces iterkeys(), which is only
        # available on Python 2.
        ock = {}
        for key in self.flat_data:
            ock[key] = {}

        pending = self.flat_data[:]
        for obs in self.flat_data:
            row = ock[obs]
            for obs2 in pending:
                row[obs2] = sum([
                    Util.coincidence(d, obs, obs2) for d in self.data
                    if obs in d and obs2 in d
                ])
            # Pairs involving obs are complete; later rows skip it.
            pending.remove(obs)
            for obs2 in pending:
                # copy the symmetric items
                ock[obs2][obs] = row[obs2]
        return ock
Esempio n. 17
0
def main():
    """
    Command-line entry point.

    Expects exactly one argument, a file name resolved against base_path.
    For every data set read from that file, prints the data set and
    alpha = 1 - observed disagreement / expected disagreement.
    Exits with status -1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        # The old message printed a literal, never-substituted "%s".
        print("usage: python Main <data file>")
        sys.exit(-1)

    data = Util.read_data(os.path.join(base_path, sys.argv[1]))

    for c, d in enumerate(data):
        od = ObservedDisagreement(d)
        ock = od.create_ock()
        od_value = od.get_disagreement(ock)

        ed = ExpectedDisagreement(od, ock)
        ed_value = ed.get()

        alpha = 1 - od_value / ed_value
        # Single-argument print(...) prints the same text as the print
        # statement on Python 2 and is also valid syntax on Python 3.
        print("data[{0}] = {1}".format(c, d))
        print("alpha = {0}".format(alpha))
Esempio n. 18
0
def main():
    """
    Command-line entry point.

    Expects exactly one argument, a file name resolved against base_path.
    For every data set read from that file, prints the data set and
    alpha = 1 - observed disagreement / expected disagreement.
    Exits with status -1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        # The old message printed a literal, never-substituted "%s".
        print("usage: python Main <data file>")
        sys.exit(-1)

    data_file = os.path.join(base_path, sys.argv[1])

    for index, data_set in enumerate(Util.read_data(data_file)):
        observed = ObservedDisagreement(data_set)
        ock = observed.create_ock()
        observed_value = observed.get_disagreement(ock)
        expected_value = ExpectedDisagreement(observed, ock).get()

        alpha = 1 - observed_value / expected_value
        # Single-argument print(...) prints the same text as the print
        # statement on Python 2 and is also valid syntax on Python 3.
        print("data[{0}] = {1}".format(index, data_set))
        print("alpha = {0}".format(alpha))
Esempio n. 19
0
 def test_coincidence(self):
     """
     Check Util.list_item_count and Util.coincidence on two small cases,
     including that coincidence gives the same value in both argument
     orders.
     """
     banner = "*" * 6
     # Single-argument print(...) prints the same text as the print
     # statement on Python 2 and is also valid syntax on Python 3.
     print("")
     print("{0}testing coincidence{0}".format(banner))

     # assertEqual replaces the deprecated alias assertEquals throughout.
     data = [(), ('Jazz',), ('Jazz',)]
     self.assertEqual(Util.list_item_count(data, ('Jazz', )), 2, "Jazz occures for twice")
     self.assertEqual(Util.coincidence(data, (), ('Jazz',)), 1.0, "coincidence of {} and (Jazz,) is 1")
     self.assertEqual(Util.coincidence(data, ('Jazz', ), ('Jazz',)), 1.0, "coincidence of (Jazz,) and (Jazz,) is 1")

     data2 = [('Funk', 'Rock', 'Jazz'), ('Funk',), ('Funk', 'Rock')]
     self.assertEqual(Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock')), 0.5, 'coincidence of (Funk,) and (Funk, Rock) is 0.5')
     self.assertEqual(Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock')), Util.coincidence(data2, ('Funk', 'Rock'), ('Funk',)), 'coincidence is symmetric')

     print("{0}tested coincidence{0}".format(banner))
     print("")
Esempio n. 20
0
    def test_distance(self):
        """
        Check Util.distance for two empty observations, for an empty versus
        a non-empty observation, for a partial overlap, and for symmetry.
        """
        # Single-argument print(...) prints the same text as the print
        # statement on Python 2 and is also valid syntax on Python 3.
        print("")
        print("{0}testing distance{0}".format("*" * 6))
        d1 = ()
        print("distance({{}}, {{}}) = {0}".format(Util.distance(d1, d1)))
        # assertEqual replaces the deprecated alias assertEquals.
        self.assertEqual(Util.distance(d1, d1), 0,
                         "distance({Empty}, {Empty}) = 0")
        d2 = ("Funk", "Rock")
        # The label used to read "distance({Funk}, {Rock})", which is not
        # the pair being measured (the empty observation vs (Funk, Rock)).
        print("distance({{}}, {{Funk, Rock}}) = {0}".format(Util.distance(
            d1, d2)))
        self.assertEqual(Util.distance(d1, d2), 1.0,
                         "distance({Empty}, {Rock, Funk}) = 1")

        d3 = ("Funk", "Rock")
        d4 = ("Funk", "Jazz", "Rock")
        print("distance({{Funk, Rock}}, {{Funk, Rock, Jazz}}) = {0}".format(
            Util.distance(d3, d4)))
        self.assertAlmostEqual(
            Util.distance(d3, d4), 0.2, 5,
            "distance({Funk, Rock}, {Funk, Rock, Jazz}) = 0.2")
        self.assertEqual(Util.distance(d3, d4), Util.distance(d4, d3),
                         "distance is symmetric")
        print("{0}tested distance{0}".format("*" * 6))
        print("")
Esempio n. 21
0
    def test_coincidence(self):
        """
        Check Util.list_item_count and Util.coincidence on two small cases,
        including that coincidence gives the same value in both argument
        orders.
        """
        # Single-argument print(...) prints the same text as the print
        # statement on Python 2 and is also valid syntax on Python 3.
        print("")
        print("{0}testing coincidence{0}".format("*" * 6))
        data = [(), ('Jazz', ), ('Jazz', )]
        # assertEqual replaces the deprecated alias assertEquals throughout.
        self.assertEqual(Util.list_item_count(data, ('Jazz', )), 2,
                         "Jazz occures for twice")
        self.assertEqual(Util.coincidence(data, (), ('Jazz', )), 1.0,
                         "coincidence of {} and (Jazz,) is 1")
        self.assertEqual(Util.coincidence(data, ('Jazz', ), ('Jazz', )), 1.0,
                         "coincidence of (Jazz,) and (Jazz,) is 1")

        data2 = [('Funk', 'Rock', 'Jazz'), ('Funk', ), ('Funk', 'Rock')]
        forward = Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock'))
        self.assertEqual(
            forward, 0.5,
            'coincidence of (Funk,) and (Funk, Rock) is 0.5')
        self.assertEqual(
            Util.coincidence(data2, ('Funk', ), ('Funk', 'Rock')),
            Util.coincidence(data2, ('Funk', 'Rock'), ('Funk', )),
            'coincidence is symmetric')
        print("{0}tested coincidence{0}".format("*" * 6))
        print("")
Esempio n. 22
0
        
        flat_data_2 = self.flat_data[:]
        for obs in self.flat_data:
            for obs2 in flat_data_2:
                ock[obs][obs2] = sum([
                                      Util.coincidence(d, obs, obs2) for d in self.data
                                      if obs in d and obs2 in d
                                      ])
            flat_data_2.remove(obs)
            for obs2 in flat_data_2:
                # copy the symmetric items
                ock[obs2][obs] = ock[obs][obs2]
        return ock

    def get_disagreement(self, ock):
        """
        Return the distance-weighted total of ock divided by its plain total.

        Both totals cover every ordered pair of entries in self.flat_data
        and are accumulated one row at a time.

        @param ock: dict of dicts indexed as ock[f1][f2]
        """
        items = self.flat_data

        plain_rows = []
        for f1 in items:
            plain_rows.append(sum([ock[f1][f2] for f2 in items]))
        ndotdot = sum(plain_rows)

        distanced_rows = []
        for f1 in items:
            distanced_rows.append(
                sum([ock[f1][f2] * Util.distance(f1, f2) for f2 in items]))

        return sum(distanced_rows) / ndotdot
    
    def get(self):
        """
        Build the coincidence table with create_ock() and return the
        disagreement computed from it by get_disagreement().
        """
        return self.get_disagreement(self.create_ock())

# Script entry: load the first data set of data/music.dat and wrap it in an
# ObservedDisagreement.
if __name__ == "__main__":
    import os
    
    # The data directory sits three directory levels above this file.
    data = Util.read_data(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "music.dat"))[0]
    od = ObservedDisagreement(data)
    
Esempio n. 23
0
 def __init__(self, observed_disagreement, ock):
     """
     Keep the observed-disagreement object and its coincidence table, and
     derive the data items and their combinations from the flattened
     observations.

     @param observed_disagreement: object exposing a flat_data attribute
     @param ock: coincidence table, stored unchanged
     """
     self.od, self.ock = observed_disagreement, ock
     items = Util.get_data_item(observed_disagreement.flat_data)
     self.data_item = items
     self.data_combinations = Util.get_data_item_combination(items)
Esempio n. 24
0
 def get_disagreement(self, ock):
     """
     Return the distance-weighted total of ock divided by its plain total,
     both taken over every ordered pair of entries in self.flat_data.

     @param ock: dict of dicts indexed as ock[f1][f2]
     """
     observations = self.flat_data

     def row_total(f1):
         return sum([ock[f1][f2] for f2 in observations])

     def distanced_row_total(f1):
         return sum([ock[f1][f2] * Util.distance(f1, f2)
                     for f2 in observations])

     ndotdot = sum([row_total(f1) for f1 in observations])
     distanced_ock_sum = sum(
         [distanced_row_total(f1) for f1 in observations])

     return distanced_ock_sum / ndotdot
Esempio n. 25
0
 def __init__(self, observed_disagreement, ock):
     """
     Store the two inputs and precompute the data items and the item
     combinations from observed_disagreement.flat_data.

     @param observed_disagreement: object exposing a flat_data attribute
     @param ock: coincidence table, stored unchanged
     """
     self.od = observed_disagreement
     self.ock = ock

     data_item = Util.get_data_item(observed_disagreement.flat_data)
     combinations = Util.get_data_item_combination(data_item)
     self.data_item = data_item
     self.data_combinations = combinations
 def setUpClass(cls):
     """
     Read the first data set of data/music.dat (located three directory
     levels above this file) and store on the class an ExpectedDisagreement
     built from it and its coincidence table.
     """
     base_dir = os.path.dirname(__file__)
     for _ in range(2):
         base_dir = os.path.dirname(base_dir)
     music_file = os.path.join(base_dir, "data", "music.dat")

     od = ObservedDisagreement(Util.read_data(music_file)[0])
     cls.ed = ExpectedDisagreement(od, od.create_ock())
Esempio n. 27
0
            for obs2 in flat_data_2:
                # copy the symmetric items
                ock[obs2][obs] = ock[obs][obs2]
        return ock

    def get_disagreement(self, ock):
        """
        Compute the ratio of the distance-weighted table total to the plain
        table total.

        Rows and columns both run over self.flat_data; each row is summed
        first and the row sums are then added up.

        @param ock: dict of dicts indexed as ock[f1][f2]
        @return: weighted total / plain total
        """
        flat = self.flat_data

        denominator = sum(
            sum(ock[row][col] for col in flat) for row in flat)
        numerator = sum(
            sum(ock[row][col] * Util.distance(row, col) for col in flat)
            for row in flat)

        return numerator / denominator

    def get(self):
        """
        Convenience wrapper: create the coincidence table and return the
        disagreement derived from it.
        """
        table = self.create_ock()
        result = self.get_disagreement(table)
        return result


# Script entry: load the first data set of data/music.dat and wrap it in an
# ObservedDisagreement.
if __name__ == "__main__":
    import os

    # The data directory sits three directory levels above this file.
    data = Util.read_data(
        os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            "data", "music.dat"))[0]
    od = ObservedDisagreement(data)