def counters_sum_test():
    """Adding two counters merges their counts and renormalizes the result."""
    left = NormalizedCounter()
    right = NormalizedCounter()
    left.insert("aaaac")
    right.insert("bbbc")
    merged = left + right
    # 9 characters in total: 4 'a', 3 'b', 2 'c'
    assert merged["b"] == 3. / 9
    assert merged["a"] == 4. / 9
    assert merged["c"] == 2. / 9
def transformation_test():
    """transform() maps every key through a callable and re-aggregates."""
    counter = NormalizedCounter("aA")
    assert counter["a"] == 0.5
    assert counter["A"] == 0.5
    # upper-casing collapses 'a' and 'A' into a single key
    counter.transform(str.upper)
    assert counter["A"] == 1

    def rename_a_to_b(symbol):
        return "B" if symbol == "A" else symbol

    other = NormalizedCounter("AAACCCB")
    other.transform(rename_a_to_b)
    # the three 'A's merge with the existing 'B': 4 of 7 characters
    assert other["B"] == 4. / 7
def __init__(self, content=None):
    """
    Build an analyzer.

    ``content`` may be a string or a dict of absolute frequencies, e.g.
    ``Analyzer({"a": 4, "b": 8, "c": 1})``.
    """
    # Normalization itself is delegated to NormalizedCounter.
    self.counter = NormalizedCounter(content)
def absolute_counts_test():
    """absolute_counts() exposes the raw, un-normalized character counts."""
    counter = NormalizedCounter()
    counter.insert("aaabbc")
    raw = counter.absolute_counts()
    assert raw["a"] == 3
    assert raw["b"] == 2
    assert raw["c"] == 1
    assert len(raw) == 3
def contains_test():
    """Membership reflects inserted characters; deletion removes a key."""
    counter = NormalizedCounter()
    counter.insert("argoaijrgaorigjabaneoiarneaorn")
    assert "5" not in counter
    assert "a" in counter
    del counter["a"]
    # every key the counter still yields must also pass the membership test
    assert all(char in counter for char in counter)
def counter_distance_test():
    """counter_distance compares relative frequency distributions."""
    first = NormalizedCounter()
    first.insert("a")
    second = NormalizedCounter()
    second.insert("b")
    assert counter_distance(first, second) == 2
    # distance depends only on proportions, not on absolute counts
    first.insert("aaa")
    assert counter_distance(first, second) == 2
    first.insert("c")
    assert counter_distance(first, second) == 1 + 0.8**2 + 0.2**2
    # distance is commutative
    first.insert("adairgaoergjaperogianrg")
    second.insert("agoaerbpaoibnabnaperioanerpgainergp")
    assert counter_distance(first, second) == counter_distance(second, first)
def elements_test():
    """elements() yields the distinct keys; their frequencies sum to one."""
    counter = NormalizedCounter()
    assert len(counter) == 0
    assert len(list(counter.elements())) == 0
    counter.insert("a" * 5)
    # a single distinct key carries the whole mass
    assert counter["a"] == 1.0
    assert len(counter) == 1
    assert len(list(counter.elements())) == 1
    counter.insert("argaoergiajrg")
    assert sum(counter[key] for key in counter.elements()) == 1
def score(self, content):
    """
    Assign a score to any string.

    The smaller the score, the more similar the frequency distributions;
    0 means the distributions of ``content`` and this analyzer are equal.

    :param content: the string to be scored.
    :returns: a float number
    """
    candidate = NormalizedCounter()
    candidate.insert(content)
    return counter_distance(self.counter, candidate)
def basic_test():
    """Insert, look up absent keys, and delete — frequencies stay normalized."""
    counter = NormalizedCounter()
    counter.insert("abcd")
    for char in "abcd":
        assert counter[char] == 0.25
    # insert some more data: 5 of 8 characters are now 'a'
    counter.insert("aaaa")
    assert counter["a"] == 5.0 / 8
    assert counter["e"] == 0
    # delete the 'a's; the remaining b/c/d renormalize to thirds
    del counter["a"]
    assert counter["a"] == 0
    assert counter["b"] == 1. / 3
def iterable_test():
    """Iterating the counter yields keys whose frequencies sum to one."""
    counter = NormalizedCounter()
    counter.insert("fgaijogarjgaorigjarogijarogiar!)")
    assert sum(counter[key] for key in counter) == 1
def most_common_test():
    """most_common() returns (key, frequency) pairs in descending order."""
    counter = NormalizedCounter({"a": 8, "b": 4, "c": 2})
    ranking = counter.most_common()
    assert ranking == [("a", 8. / 14), ("b", 4. / 14), ("c", 2. / 14)]
def dictionary_constructor_test():
    """A dict of absolute counts is normalized at construction time."""
    counter = NormalizedCounter({"a": 4, "b": 3})
    assert counter["a"] == 4. / 7
    assert counter["b"] == 3. / 7
def load(self, filename):
    """
    Load a frequency distribution file and add it to the current
    distribution.

    :param filename: path of a JSON file holding absolute frequencies.
    """
    with open(filename) as source:
        loaded = NormalizedCounter(json.load(source))
    self.counter += loaded