Exemple #1
0
	def test_transform(self):
		"""transform() with a "starts with 't'" callback is expected to yield
		a counter keyed by True/False holding 0.6 and 0.4 respectively."""
		word_probs = StatsCounter({
			'of': 0.20,
			'the': 0.50,
			'that': 0.10,
			'from': 0.20,
		})

		def starts_with_t(word, prob):
			# Only the word is inspected; the probability is ignored.
			return word.startswith('t')

		regrouped = word_probs.transform(starts_with_t)

		expected = StatsCounter({True: 0.6, False: 0.4})
		assert regrouped == expected
Exemple #2
0
	def test_normalize(self):
		"""normalize() on counts 1, 2, 1 is expected to give 0.25, 0.50, 0.25."""
		counts = StatsCounter({1: 1, 2: 2, 3: 1})
		proportions = counts.normalize()
		assert proportions == {1: 0.25, 2: 0.50, 3: 0.25}
    def _get_revision_word_dist(self, page_title, revid):
        """Return the normalized word distribution for one revision of a page.

        Results are memoized in ``self.ctitle_to_revids_to_word_dist`` under
        ``page_title`` and then ``revid``; a cached distribution is returned
        as-is. Otherwise the revision text is fetched with
        ``self._get_revision_text``, tokenized with ``wordpunct_tokenize``,
        lower-cased, filtered against ``STOPWORDS`` and ``PUNCTUATION``,
        counted with ``StatsCounter`` and normalized; the result is stored in
        the cache before being returned.

        NOTE(review): ``page_title`` is looked up without a fallback, so with
        a plain dict an unknown title raises ``KeyError`` -- confirm whether
        the mapping is a defaultdict.
        """
        revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]

        if revid in revids_to_word_dist:
            return revids_to_word_dist[revid]

        text = self._get_revision_text(page_title, revid)

        # Lower-case each token exactly once instead of once per comparison.
        lowered = (word.lower() for word in wordpunct_tokenize(text))
        words = [word for word in lowered
                 if word not in STOPWORDS and word not in PUNCTUATION]

        pdist = StatsCounter(words).normalize()

        revids_to_word_dist[revid] = pdist

        return pdist
Exemple #4
0
def test_rank(etree):
    """The first item produced by rank() is expected to pair the first
    <article> element with a counter of nine 'div' tags."""
    measured = measure(etree.xpath('//body//*/..'))
    ranked = list(rank(measured))

    top = ranked[0]
    article = etree.xpath('//article')[0]
    assert top == (article, StatsCounter(['div'] * 9))
Exemple #5
0
def test_measure(etree):
    """measure() over the parents of <div> elements is expected to report a
    single 'article' node with a counter of nine 'div' tags."""
    measured = measure(etree.xpath('//div/..'))

    # Swap each node for its tag name so the result compares against literals.
    tagged = []
    for node, metric in measured:
        tagged.append((node.tag, metric))

    expected = [('article', StatsCounter(['div'] * 9))]
    assert tagged == expected
Exemple #6
0
 def __init__(self, *args, **kwargs):
     """Initialize by forwarding all positional and keyword arguments
     unchanged to ``StatsCounter.__init__``."""
     # NOTE(review): the enclosing class is not visible here; presumably it
     # subclasses StatsCounter -- confirm before switching to super().
     StatsCounter.__init__(self, *args, **kwargs)
Exemple #7
0
	def test_get_weighted_random_value(self):
		"""get_weighted_random_value() must return one of the counter's keys."""
		wrv = StatsCounter(a=10, b=3).get_weighted_random_value()
		# The previous `wrv == "a" or "b"` parsed as `(wrv == "a") or "b"`,
		# which is always truthy, so the assertion could never fail.
		assert wrv in ("a", "b")
Exemple #8
0
class TestStatsCounter:
	"""Tests for StatsCounter's statistics helpers and distribution methods."""

	# Shared fixture: keys '0'..'999', each mapped to its own integer value.
	counter_ints = StatsCounter({str(s): s for s in range(1000)})

	def test_mean_int(self):
		"""mean() is expected to equal 499500 / 1000."""
		m = self.counter_ints.mean()
		d = 499500/1000
		assert m == d

	def test_median_low(self):
		"""median_low() is expected to be 499."""
		m = self.counter_ints.median_low()
		assert m == 499

	def test_median_high(self):
		"""median_high() is expected to be 500."""
		m = self.counter_ints.median_high()
		assert m == 500

	def test_median_grouped(self):
		"""median_grouped() is expected to be 499.5."""
		m = self.counter_ints.median_grouped()
		assert m == 499.5

	def test_mode(self):
		"""mode() is expected to raise StatisticsError for this fixture."""
		with raises(stats.StatisticsError):
			self.counter_ints.mode()

	def test_variance(self):
		"""variance() is expected to be 83416.66666666667."""
		m = self.counter_ints.variance()
		assert m == 83416.66666666667

	def test_stdev(self):
		"""stdev() is expected to be 288.8194360957494."""
		m = self.counter_ints.stdev()
		assert m == 288.8194360957494

	def test_pvariance(self):
		"""pvariance() is expected to be 83333.25."""
		m = self.counter_ints.pvariance()
		assert m == 83333.25

	def test_pstdev(self):
		"""pstdev() is expected to be 288.6749902572095."""
		m = self.counter_ints.pstdev()
		assert m == 288.6749902572095

	def test_argmax(self):
		"""argmax() is expected to return the key '999'."""
		m = self.counter_ints.argmax()
		assert m == '999'

	def test_max(self):
		"""max() is expected to return the value 999."""
		m = self.counter_ints.max()
		assert m == 999

	def test_normalize(self):
		"""normalize() on counts 1, 2, 1 is expected to give 0.25, 0.50, 0.25."""
		pdist = StatsCounter({1: 1, 2: 2, 3: 1}).normalize()
		assert pdist == {
			1: 0.25,
			2: 0.50,
			3: 0.25,
		}

	def test_get_weighted_random_value(self):
		"""get_weighted_random_value() must return one of the counter's keys."""
		wrv = StatsCounter(a=10, b=3).get_weighted_random_value()
		# The previous `wrv == "a" or "b"` parsed as `(wrv == "a") or "b"`,
		# which is always truthy, so the assertion could never fail.
		assert wrv in ("a", "b")

	def test_transform(self):
		"""transform() with a "starts with 't'" callback is expected to yield
		a counter keyed by True/False holding 0.6 and 0.4 respectively."""
		dist = StatsCounter({
			'of': 0.20,
			'the': 0.50,
			'that': 0.10,
			'from': 0.20,
		})

		dist = dist.transform(lambda word, prob: word.startswith('t'))

		assert dist == StatsCounter({True: 0.6, False: 0.4})
Exemple #9
0
def measure(nodes):
    """Pair each node with a StatsCounter built from the tags of the items
    obtained by iterating over that node.

    Returns a list of ``(node, StatsCounter)`` tuples in input order.
    """
    measured = []
    for parent in nodes:
        child_tags = [element.tag for element in parent]
        measured.append((parent, StatsCounter(child_tags)))
    return measured