def __init__(self, content=None):
    """Create an analyzer.

    :param content: optional seed data -- either a string or a dict of
        absolute frequencies, e.g. ``Analyzer({"a": 4, "b": 8, "c": 1})``.
    """
    self.counter = NormalizedCounter(content)
def most_common_test():
    """most_common() yields (char, relative frequency) pairs, highest first."""
    total = 14.0
    counts = NormalizedCounter({"a": 8, "b": 4, "c": 2})
    expected = [("a", 8 / total), ("b", 4 / total), ("c", 2 / total)]
    assert counts.most_common() == expected
def absolute_counts_test():
    """absolute_counts() must report raw occurrence counts, not ratios."""
    counter = NormalizedCounter()
    counter.insert("aaabbc")
    raw = counter.absolute_counts()
    for char, count in (("a", 3), ("b", 2), ("c", 1)):
        assert raw[char] == count
    assert len(raw) == 3
def contains_test():
    """Membership follows insertions and deletions."""
    counter = NormalizedCounter()
    counter.insert("argoaijrgaorigjabaneoiarneaorn")
    assert "5" not in counter
    assert "a" in counter
    del counter["a"]
    # every key the counter still iterates must also test as a member
    for char in counter:
        assert char in counter
def contains_test():
    """``in`` and ``del`` keep the counter's membership view consistent."""
    nc = NormalizedCounter()
    nc.insert("argoaijrgaorigjabaneoiarneaorn")
    assert "5" not in nc
    assert "a" in nc
    del nc["a"]
    remaining = list(nc)
    assert all(ch in nc for ch in remaining)
def score(self, content):
    """Score *content* by frequency-distribution distance.

    The smaller the score, the more similar the distributions; 0 means
    the content's frequency distribution equals the analyzer's.

    :param content: the string to be scored.
    :returns: a float number.
    """
    candidate = NormalizedCounter()
    candidate.insert(content)
    return counter_distance(self.counter, candidate)
def elements_test():
    """elements() and len() track the set of seen characters."""
    counter = NormalizedCounter()
    assert not len(counter)
    assert not list(counter.elements())
    counter.insert("a" * 5)
    assert counter["a"] == 1.0
    assert len(counter) == 1
    assert len(list(counter.elements())) == 1
    counter.insert("argaoergiajrg")
    total = sum(counter[key] for key in counter.elements())
    assert total == 1
def counters_sum_test():
    """Adding two counters merges their absolute counts before normalizing."""
    left = NormalizedCounter()
    right = NormalizedCounter()
    left.insert("aaaac")
    right.insert("bbbc")
    merged = left + right
    total = 9.0
    assert merged["b"] == 3 / total
    assert merged["a"] == 4 / total
    assert merged["c"] == 2 / total
def basic_test():
    """Relative frequencies update on insert and delete."""
    counter = NormalizedCounter()
    counter.insert("abcd")
    # four distinct chars, one occurrence each
    assert all(counter[ch] == 0.25 for ch in "abcd")
    # let insert some more data
    counter.insert("aaaa")
    assert counter["a"] == 5.0 / 8
    assert counter["e"] == 0
    # delete the 'a's and check if everything is right
    del counter["a"]
    assert counter["a"] == 0
    assert counter["b"] == 1.0 / 3
def elements_test():
    """elements() lists seen chars; frequencies always sum to one."""
    nc = NormalizedCounter()
    assert len(nc) == 0 == len(list(nc.elements()))
    nc.insert("aaaaa")
    assert nc["a"] == 1.0
    assert len(nc) == 1
    assert len(list(nc.elements())) == 1
    nc.insert("argaoergiajrg")
    assert sum(map(nc.__getitem__, nc.elements())) == 1
def transformation_test():
    """transform() remaps keys and merges frequencies of colliding keys."""
    counter = NormalizedCounter("aA")
    assert counter["a"] == 0.5
    assert counter["A"] == 0.5
    counter.transform(lambda s: s.upper())
    assert counter["A"] == 1

    def replace_a_with_b(char):
        return "B" if char == "A" else char

    other = NormalizedCounter("AAACCCB")
    other.transform(replace_a_with_b)
    assert other["B"] == 4.0 / 7
def counters_sum_test():
    """Summed counters renormalize over the combined length."""
    first = NormalizedCounter()
    second = NormalizedCounter()
    first.insert("aaaac")
    second.insert("bbbc")
    combined = first + second
    for char, count in (("b", 3), ("a", 4), ("c", 2)):
        assert combined[char] == count / 9.0
def transformation_test():
    """Key transformations collapse frequencies onto the mapped keys."""
    nc = NormalizedCounter("aA")
    assert nc["a"] == nc["A"] == 0.5
    nc.transform(lambda char: char.upper())
    assert nc["A"] == 1
    nc2 = NormalizedCounter("AAACCCB")
    nc2.transform(lambda char: "B" if char == "A" else char)
    assert nc2["B"] == 4.0 / 7
def basic_test():
    """Insert and delete keep relative frequencies consistent."""
    nc = NormalizedCounter()
    nc.insert("abcd")
    for ch in "abcd":
        assert nc[ch] == 1.0 / 4
    # add more data and re-check the distribution
    nc.insert("aaaa")
    assert nc["a"] == 0.625
    assert nc["e"] == 0
    # removing every 'a' renormalizes over the remaining three chars
    del nc["a"]
    assert nc["a"] == 0
    assert nc["b"] == 1.0 / 3
def counter_distance_test():
    """counter_distance compares relative distributions and is symmetric."""
    left = NormalizedCounter()
    left.insert("a")
    right = NormalizedCounter()
    right.insert("b")
    assert counter_distance(left, right) == 2
    # scaling one side changes nothing: distance depends only on proportions
    left.insert("aaa")
    assert counter_distance(left, right) == 2
    left.insert("c")
    assert counter_distance(left, right) == 1 + 0.8 ** 2 + 0.2 ** 2
    # distance is commutative
    left.insert("adairgaoergjaperogianrg")
    right.insert("agoaerbpaoibnabnaperioanerpgainergp")
    assert counter_distance(left, right) == counter_distance(right, left)
class Analyzer(object):
    """The class that performs the analysis.

    You can feed an analyzer from different sources (strings, files...)
    so that it extracts the target frequency distribution, and ask it to
    score supplied content based on frequency similarity.
    """

    def __init__(self, content=None):
        """Build an analyzer.

        :param content: optional seed -- a string, or a dict of absolute
            frequencies like ``Analyzer({"a": 4, "b": 8, "c": 1})``.
        """
        self.counter = NormalizedCounter(content)

    def feed(self, content):
        """Feed the analyzer with a string.

        :param content: the string to be fed to the analyzer.
        """
        self.counter.insert(content)

    def feed_from_raw_file(self, filename):
        """Feed the analyzer with the content of a file.

        Every character is taken into account, including newline chars.

        :param filename: the path of the file fed to the analyzer.
        """
        with open(filename) as f:
            self.feed(f.read())

    def score(self, content):
        """Assign a score to any string.

        The smaller, the more similar the frequency distribution; 0 means
        the distributions of the content and the analyzer are equal.

        :param content: the string to be scored.
        :returns: a float number.
        """
        new_counter = NormalizedCounter()
        new_counter.insert(content)
        return counter_distance(self.counter, new_counter)

    def choose_best(self, strings, n=1):
        """Return the ``n`` strings closest to the fed distribution.

        :param strings: an iterable with the candidate strings.
        :param n: the number of strings to return.
        :returns: an iterable containing the ``n`` best strings sorted by
            frequency similarity.
        """
        scores = {string: self.score(string) for string in strings}
        # BUG FIX: dict.iteritems() exists only in Python 2; items() works
        # on both Python 2 and 3. Key is passed by keyword for clarity.
        best = heapq.nsmallest(n, scores.items(), key=operator.itemgetter(1))
        return map(operator.itemgetter(0), best)

    def serialize(self):
        """Return a JSON representation of the analyzer.

        :returns: a string containing a JSON representation of the
            absolute frequencies the analyzer has been fed with.
        """
        return json.dumps(self.counter.absolute_counts())

    def store(self, filename):
        """Store the JSON representation of the analyzer to a file."""
        with open(filename, "w") as f:
            f.write(self.serialize())

    def load(self, filename):
        """Load a frequency-distribution file and add it to the current one."""
        with open(filename) as f:
            self.counter += NormalizedCounter(json.loads(f.read()))

    def discard(self, chars):
        """Remove the chars in ``chars`` from the counter.

        :param chars: an iterable consisting of the chars whose frequency
            will be set to 0.
        """
        for char in chars:
            del self.counter[char]

    def transform_keys(self, transformation):
        """Map the keys to new keys to get a new frequency distribution.

        The relative frequency of keys that map to the same key is summed
        to obtain the new frequency distribution.

        :param transformation: a callable object that maps chars to chars.
        """
        self.counter.transform(transformation)

    def keys(self):
        """Return the characters whose frequency is greater than 0."""
        return self.counter.elements()

    @classmethod
    def from_raw_file(cls, filename):
        """Return an analyzer whose distribution is read from a raw file."""
        # Use cls() rather than a hard-coded Analyzer() so subclasses work.
        analyzer = cls()
        analyzer.feed_from_raw_file(filename)
        return analyzer

    @classmethod
    def from_file(cls, filename):
        """Read a frequency distribution from a JSON file (see ``store``)."""
        analyzer = cls()
        analyzer.load(filename)
        return analyzer
def iterable_test():
    """Iterating the counter yields keys whose frequencies sum to one."""
    counter = NormalizedCounter()
    counter.insert("fgaijogarjgaorigjarogijarogiar!)")
    total = sum(counter[char] for char in counter)
    assert total == 1
def iterable_test():
    """A counter is iterable over its keys; the distribution sums to 1."""
    nc = NormalizedCounter()
    nc.insert("fgaijogarjgaorigjarogijarogiar!)")
    assert sum(map(nc.__getitem__, nc)) == 1
def most_common_test():
    """most_common() returns normalized (char, frequency) pairs, highest first."""
    nc = NormalizedCounter({"a": 8, "b": 4, "c": 2})
    pairs = nc.most_common()
    assert pairs == list(zip("abc", (8 / 14.0, 4 / 14.0, 2 / 14.0)))
def dictionary_constructor_test():
    """Constructing from a dict of absolute counts normalizes them."""
    counter = NormalizedCounter({"a": 4, "b": 3})
    total = 7.0
    assert counter["a"] == 4 / total
    assert counter["b"] == 3 / total
def load(self, filename):
    """Load a frequency-distribution file and merge it into the current one.

    :param filename: path to a JSON file of absolute frequencies.
    """
    with open(filename) as f:
        raw = f.read()
    self.counter += NormalizedCounter(json.loads(raw))