Ejemplo n.º 1
0
    def test_transform(self):
        """
        Transform a two-paper FeatureSet and check the resulting structures.

        The transformer returns three times its second argument; the test
        expects the aggregate count for 'bob' (3 in each paper) to become 18.
        """

        feature1 = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
        feature2 = Feature([('bob', 3), ('jane', 1), ('fido', 1)])
        featureset = FeatureSet({'p1': feature1, 'p2': feature2})

        featureset_transformed = featureset.transform(
            lambda f, c, C, DC: c * 3)

        self.assertEqual(len(featureset_transformed.features), 2)

        # Derive the expectation from the inputs, not from the object under
        # test: using ``featureset_transformed.unique`` here would hide tokens
        # from the first paper going missing after the transformation.
        expected = len(feature1.unique | feature2.unique)

        # Each per-token structure should hold one entry per distinct token.
        self.assertEqual(len(featureset_transformed.index), expected)
        self.assertEqual(len(featureset_transformed.lookup), expected)
        self.assertEqual(len(featureset_transformed.counts), expected)
        self.assertEqual(len(featureset_transformed.documentCounts), expected)
        self.assertEqual(len(featureset_transformed.unique), expected)

        self.assertEqual(featureset_transformed.documentCount('bob'), 2)

        self.assertEqual(featureset_transformed.count('bob'), 18)

        self.assertIn('p1', featureset_transformed.papers_containing('bob'))
        self.assertIn('p2', featureset_transformed.papers_containing('bob'))
Ejemplo n.º 2
0
    def test_init_features(self):
        """
        Construct a FeatureSet from a dict holding two Features.
        """

        logger.debug('FeatureSet should have 2 Features')
        first = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
        second = Feature([('bob', 3), ('jane', 1), ('fido', 1)])
        featureset = FeatureSet({'p1': first, 'p2': second})

        self.assertEqual(len(featureset.features), 2)

        # Each per-token structure should hold one entry per distinct token.
        n_tokens = len(first.unique | second.unique)
        for attr in ('index', 'lookup', 'counts', 'documentCounts', 'unique'):
            self.assertEqual(len(getattr(featureset, attr)), n_tokens)

        self.assertEqual(featureset.documentCount('bob'), 2)
        self.assertEqual(featureset.count('bob'), 6)

        # 'bob' occurs in both papers.
        for paper in ('p1', 'p2'):
            self.assertIn(paper, featureset.papers_containing('bob'))
Ejemplo n.º 3
0
 def test_empty_feature(self):
     """
     Building a FeatureSet that includes an empty Feature should not raise.
     """
     feature1 = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
     feature2 = Feature([])
     try:
         FeatureSet({'p1': feature1, 'p2': feature2})
     except Exception as E:
         # Exceptions have no ``message`` attribute on Python 3; ``str(E)``
         # reports the error text on every Python version.
         self.fail(str(E))
Ejemplo n.º 4
0
    def test_angular_similarity(self):
        """
        ``angular_similarity`` of two Features should be a positive float.
        """
        # Constructed but not passed to the similarity call below.
        _unused = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
        right = Feature([('blob', 3), ('joe', 1), ('brobert', 1)])
        left = Feature([('blob', 1), ('joe', 2), ('brobert', 1)])

        similarity = angular_similarity(left, right)

        self.assertIsInstance(similarity, float)
        self.assertGreater(similarity, 0.)
Ejemplo n.º 5
0
    def test_iadd(self):
        """
        ``+=`` accepts a list of 2-tuples, a list of tokens, or a bare token.
        """
        feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])

        # (right-hand operand, value of 'bob' expected afterwards)
        steps = (
            ([('bob', 1)], 4),
            (['bob'], 5),
            ('bob', 6),
        )
        for operand, expected in steps:
            feature += operand
            self.assertEqual(feature.value('bob'), expected)
Ejemplo n.º 6
0
    def test_isub(self):
        """
        ``-=`` accepts a list of 2-tuples, a list of tokens, or a bare token.
        """
        feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])

        # (right-hand operand, value of 'bob' expected afterwards)
        steps = (
            ([('bob', 1)], 2),
            (['bob'], 1),
            ('bob', 0),
        )
        for operand, expected in steps:
            feature -= operand
            self.assertEqual(feature.value('bob'), expected)
Ejemplo n.º 7
0
    def test_as_matrix(self):
        """
        ``as_matrix`` yields one row per paper and one column per unique token.
        """
        featureset = FeatureSet()
        papers = (
            ('p1', Feature([('bob', 3), ('joe', 1), ('bobert', 1)])),
            ('p2', Feature([('blob', 3), ('joe', 1), ('brobert', 1)])),
            ('p3', Feature([('blob', 1), ('joe', 1), ('brobert', 1)])),
        )
        for paper, feature in papers:
            featureset.add(paper, feature)

        matrix = featureset.as_matrix()
        n_rows = len(matrix)
        self.assertEqual(n_rows, len(featureset))
        n_columns = len(matrix[0])
        self.assertEqual(n_columns, len(featureset.unique))
Ejemplo n.º 8
0
    def test_kl_divergence(self):
        """
        ``kl_divergence`` between the vectors of 'p2' and 'p3' should be a
        positive float.
        """
        papers = (
            ('p1', Feature([('bob', 3), ('joe', 1), ('bobert', 1)])),
            ('p2', Feature([('blob', 3), ('joe', 1), ('brobert', 1)])),
            ('p3', Feature([('blob', 1), ('joe', 2), ('brobert', 1)])),
        )
        featureset = FeatureSet()
        for paper, feature in papers:
            featureset.add(paper, feature)

        vector_p2 = featureset.as_vector('p2')
        vector_p3 = featureset.as_vector('p3')
        divergence = kl_divergence(vector_p2, vector_p3)

        self.assertIsInstance(divergence, float)
        self.assertGreater(divergence, 0.)
Ejemplo n.º 9
0
    def test_init_datum(self):
        """
        A Feature built from one bare token holds that token with value 1.
        """
        single = Feature('bob')

        self.assertEqual(len(single), 1)

        first_entry = single[0]
        self.assertEqual(first_entry, ('bob', 1))
Ejemplo n.º 10
0
    def test_init_counts(self):
        """
        A Feature can be initialized from a list of (token, value) 2-tuples.
        """
        feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])

        self.assertEqual(len(feature), 3)

        values = dict(feature)
        for token, value in (('bob', 3), ('joe', 1)):
            self.assertEqual(values[token], value)
Ejemplo n.º 11
0
    def test_init_list(self):
        """
        A Feature built from a flat token list tallies repeated tokens.
        """
        feature = Feature(['bob', 'joe', 'bob', 'bobert', 'bob'])

        # Three distinct tokens; 'bob' appears three times in the input.
        self.assertEqual(len(feature), 3)

        values = dict(feature)
        for token, value in (('bob', 3), ('joe', 1)):
            self.assertEqual(values[token], value)
Ejemplo n.º 12
0
    def test_as_vector(self):
        """
        ``as_vector`` returns a list with one slot per unique token; with
        ``norm=True`` the entries are expected to sum to 1.
        """
        featureset = FeatureSet()
        feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
        feature2 = Feature([('blob', 3), ('joe', 1), ('brobert', 1)])
        feature3 = Feature([('blob', 1), ('joe', 1), ('brobert', 1)])
        featureset.add('p1', feature)
        featureset.add('p2', feature2)
        featureset.add('p3', feature3)

        v = featureset.as_vector('p1')
        v_norm = featureset.as_vector('p1', norm=True)

        self.assertIsInstance(v, list)
        self.assertIsInstance(v_norm, list)
        self.assertEqual(len(v), len(v_norm))
        self.assertEqual(len(v), len(featureset.unique))
        self.assertGreater(sum(v), 0)
        self.assertGreater(sum(v_norm), 0)
        # The sum of floats can be off by rounding error depending on the
        # order of the entries, so compare approximately rather than exactly.
        self.assertAlmostEqual(sum(v_norm), 1.0)
Ejemplo n.º 13
0
    def test_add_feature(self):
        """
        Start from an empty FeatureSet and add a single Feature.
        """

        # Run the identical scenario twice, each time with a fresh FeatureSet:
        # there was some weirdness with the FeatureSet constructor.
        for _ in range(2):
            featureset = FeatureSet()
            feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
            featureset.add('p1', feature)

            self.assertEqual(len(featureset.features), 1)

            # Each per-token structure should hold one entry per token.
            n_tokens = len(feature.unique)
            for attr in ('index', 'lookup', 'counts', 'documentCounts',
                         'unique'):
                self.assertEqual(len(getattr(featureset, attr)), n_tokens)

            self.assertIn('p1', featureset.papers_containing('bob'))

            self.assertEqual(featureset.documentCount('bob'), 1)
            self.assertEqual(featureset.count('bob'), 3)
Ejemplo n.º 14
0
    def test_top(self):
        """
        ``top(N)`` returns a list of N tuples, both with the default ranking
        and with ``by='documentCounts'``.
        """
        featureset = FeatureSet()
        papers = (
            ('p1', Feature([('bob', 3), ('joe', 1), ('bobert', 1)])),
            ('p2', Feature([('blob', 3), ('joe', 1), ('brobert', 1)])),
            ('p3', Feature([('blob', 1), ('joe', 1), ('brobert', 1)])),
        )
        for paper, feature in papers:
            featureset.add(paper, feature)

        N = 3
        # (extra keyword arguments for ``top``, tokens expected in the result)
        cases = (
            ({}, set(['blob', 'bob', 'joe'])),
            ({'by': 'documentCounts'}, set(['blob', 'brobert', 'joe'])),
        )
        for kwargs, expected_tokens in cases:
            top = featureset.top(N, **kwargs)
            self.assertIsInstance(top, list)
            self.assertIsInstance(top[0], tuple)
            self.assertEqual(len(top), N)
            # Only the first element of each tuple (the token) is compared.
            returned_tokens = set(entry[0] for entry in top)
            self.assertSetEqual(returned_tokens, expected_tokens)
Ejemplo n.º 15
0
    def test_extend(self):
        """
        ``extend`` accepts a list of 2-tuples, a list of tokens, or a bare
        token.
        """
        feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])

        # (argument to ``extend``, value of 'bob' expected afterwards)
        steps = (
            ([('bob', 1)], 4),
            (['bob'], 5),
            ('bob', 6),
        )
        for addition, expected in steps:
            feature.extend(addition)
            self.assertEqual(feature.value('bob'), expected)
Ejemplo n.º 16
0
 def test_norm(self):
     """
     Each value in ``Feature.norm`` equals the raw value over the total.
     """
     feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])

     # Second element of every (token, value) pair.
     raw_values = list(zip(*feature))[1]
     total = sum(raw_values)

     normed_values = list(zip(*feature.norm))[1]
     for normed, raw in zip(normed_values, raw_values):
         self.assertEqual(normed, float(raw) / total)
Ejemplo n.º 17
0
    def test_init_tuples(self):
        """
        2-tuples of strings are each expected to be stored as a single token
        with value 1.
        """
        pairs = [('bob', 'dole'), ('roy', 'snaydon')]
        feature = Feature(pairs)

        self.assertEqual(len(feature), 2)

        values = dict(feature)
        self.assertEqual(values[('bob', 'dole')], 1)