Example #1
0
    def __init__(self, content=None):
        """Create an analyzer seeded with optional content.

        :param content: either a string, or a dict of absolute
            frequencies such as ``Analyzer({"a": 4, "b": 8, "c": 1})``.
            ``None`` starts from an empty distribution.
        """
        self.counter = NormalizedCounter(content)
def most_common_test():
    """most_common() yields (char, relative frequency) pairs, highest first."""
    counts = {"a": 8, "b": 4, "c": 2}
    counter = NormalizedCounter(counts)
    total = float(sum(counts.values()))
    expected = [(char, counts[char] / total) for char in "abc"]
    assert counter.most_common() == expected
def absolute_counts_test():
    """absolute_counts() exposes the raw, non-normalized character counts."""
    counter = NormalizedCounter()
    counter.insert("aaabbc")
    raw = counter.absolute_counts()
    expected = {"a": 3, "b": 2, "c": 1}
    for char, count in expected.items():
        assert raw[char] == count
    assert len(raw) == len(expected)
def absolute_counts_test():
    """Raw counts from absolute_counts() reflect the inserted string exactly."""
    nc = NormalizedCounter()
    nc.insert("aaabbc")
    absolute = nc.absolute_counts()
    assert (absolute["a"], absolute["b"], absolute["c"]) == (3, 2, 1)
    assert len(absolute) == 3
def contains_test():
    """Membership tracks inserted characters; deleted characters drop out."""
    counter = NormalizedCounter()
    counter.insert("argoaijrgaorigjabaneoiarneaorn")

    assert "5" not in counter
    assert "a" in counter

    del counter["a"]

    # every remaining key must still report membership
    for char in counter:
        assert char in counter
def contains_test():
    """__contains__ reflects inserts and deletions."""
    nc = NormalizedCounter()
    sample = "argoaijrgaorigjabaneoiarneaorn"
    nc.insert(sample)

    assert ("5" in nc) is False
    assert ("a" in nc) is True

    del nc["a"]

    assert all(char in nc for char in nc)
Example #7
0
    def score(self, content):
        """Score *content* by frequency-distribution similarity.

        The smaller the score, the more similar the distribution; 0 means
        the frequency distributions of the content and the analyzer match
        exactly.

        :param content: the string to be scored.
        :returns: a float number
        """
        sample = NormalizedCounter()
        sample.insert(content)
        return counter_distance(self.counter, sample)
def elements_test():
    """elements() lists chars with nonzero count; frequencies sum to 1."""
    counter = NormalizedCounter()

    assert len(counter) == 0
    assert list(counter.elements()) == []

    counter.insert("aaaaa")  # same as "a" * 5

    assert counter["a"] == 1.0
    assert len(counter) == 1
    assert len(list(counter.elements())) == 1

    counter.insert("argaoergiajrg")
    total = sum(counter[key] for key in counter.elements())
    assert total == 1
def counters_sum_test():
    """Adding two counters merges their counts before renormalizing."""
    left = NormalizedCounter()
    right = NormalizedCounter()

    left.insert("aaaac")
    right.insert("bbbc")

    merged = left + right

    total = 9.0  # 5 + 4 characters across both counters
    assert merged["b"] == 3 / total
    assert merged["a"] == 4 / total
    assert merged["c"] == 2 / total
def basic_test():
    """Insertion normalizes frequencies; deletion renormalizes the rest."""
    nc = NormalizedCounter()

    nc.insert("abcd")
    assert all(nc[char] == 0.25 for char in "abcd")

    # extra data shifts the distribution: 5 of 8 chars are now "a"
    nc.insert("aaaa")
    assert nc["a"] == 5.0 / 8
    assert nc["e"] == 0  # unseen chars report zero

    # dropping the 'a's renormalizes over the remaining 3 characters
    del nc["a"]
    assert nc["a"] == 0
    assert nc["b"] == 1.0 / 3
def elements_test():
    """An empty counter has no elements; inserts populate them."""
    nc = NormalizedCounter()

    assert len(nc) == 0
    assert not list(nc.elements())

    nc.insert("a" * 5)

    assert nc["a"] == 1.0
    assert len(nc) == 1
    assert len(list(nc.elements())) == 1

    nc.insert("argaoergiajrg")
    assert sum(nc[key] for key in nc.elements()) == 1
def transformation_test():
    """transform() remaps keys, merging frequencies that collide."""
    counter = NormalizedCounter("aA")

    assert counter["a"] == 0.5
    assert counter["A"] == 0.5

    counter.transform(lambda ch: ch.upper())
    assert counter["A"] == 1

    # "A" maps to "B", so "B" absorbs the frequency of both
    collapsed = NormalizedCounter("AAACCCB")
    collapsed.transform(lambda ch: "B" if ch == "A" else ch)
    assert collapsed["B"] == 4.0 / 7
def counters_sum_test():
    """The + operator combines counters over the union of their keys."""
    first = NormalizedCounter()
    second = NormalizedCounter()

    first.insert("aaaac")
    second.insert("bbbc")

    combined = first + second

    expected = {"a": 4.0 / 9, "b": 3.0 / 9, "c": 2.0 / 9}
    for char, freq in expected.items():
        assert combined[char] == freq
def transformation_test():
    """Keys can be remapped; colliding keys accumulate frequency."""
    nc = NormalizedCounter("aA")

    assert nc["a"] == 0.5
    assert nc["A"] == 0.5

    nc.transform(lambda s: s.upper())
    assert nc["A"] == 1

    def a_to_b(s):
        if s == "A":
            return "B"
        return s

    nc2 = NormalizedCounter("AAACCCB")
    nc2.transform(a_to_b)
    assert nc2["B"] == 4.0 / 7
def basic_test():
    """End-to-end check of insert, lookup, and delete semantics."""
    counter = NormalizedCounter()

    counter.insert("abcd")
    for char in "abcd":
        assert counter[char] == 1.0 / 4

    counter.insert("aaaa")  # 8 chars total, five of them "a"

    assert counter["a"] == 5.0 / 8
    assert counter["e"] == 0

    del counter["a"]  # remove "a" entirely; the rest renormalizes

    assert counter["a"] == 0
    assert counter["b"] == 1.0 / 3
Example #16
0
def counter_distance_test():
    """counter_distance depends only on proportions and is symmetric."""
    first = NormalizedCounter()
    first.insert("a")

    second = NormalizedCounter()
    second.insert("b")

    assert counter_distance(first, second) == 2

    # same proportions -> same distance
    first.insert("aaa")
    assert counter_distance(first, second) == 2

    first.insert("c")
    expected = 1 + 0.8 ** 2 + 0.2 ** 2
    assert counter_distance(first, second) == expected

    # symmetric in its arguments
    first.insert("adairgaoergjaperogianrg")
    second.insert("agoaerbpaoibnabnaperioanerpgainergp")
    assert counter_distance(first, second) == counter_distance(second, first)
Example #17
0
class Analyzer(object):
    """Frequency-distribution analyzer.

    You can feed an analyzer from different sources (strings, files...) so
    that it extracts the target frequency distribution, then ask it to
    score supplied content based on frequency similarity.
    """

    def __init__(self, content=None):
        """Build an analyzer.

        :param content: optional seed — a string, or a dict with absolute
            frequencies like ``Analyzer({"a": 4, "b": 8, "c": 1})``.
        """
        self.counter = NormalizedCounter(content)

    def feed(self, content):
        """Feed the analyzer with a string.

        :param content: the string to be fed to the analyzer
        """
        self.counter.insert(content)

    def feed_from_raw_file(self, filename):
        """Feed the analyzer with the content of a file.

        Every character is taken into account, including newline chars.

        :param filename: path of the file that will be fed to the analyzer
        """
        with open(filename) as f:
            self.feed(f.read())

    def score(self, content):
        """Assign a score to any string.

        The smaller the score, the more similar the frequency distribution;
        0 means the distributions of the content and the analyzer are equal.

        :param content: the string to be scored.
        :returns: a float number
        """
        new_counter = NormalizedCounter()
        new_counter.insert(content)

        return counter_distance(self.counter, new_counter)

    def choose_best(self, strings, n=1):
        """Return the ``n`` strings most similar to the fed distribution.

        :param strings: an iterable with the candidate strings.
        :param n: an integer specifying how many strings to return.
        :returns: an iterable containing the ``n`` best strings sorted by
            frequency similarity (best first).
        """
        scores = {string: self.score(string) for string in strings}

        # Fix: dict.iteritems() is Python-2-only, and heapq.nsmallest's
        # key argument is keyword-only in Python 3; items() works on both.
        best = heapq.nsmallest(n, scores.items(), key=operator.itemgetter(1))
        return [string for string, _ in best]

    def serialize(self):
        """Return a JSON representation of the analyzer.

        :returns: a string containing a JSON representation of the absolute
            frequencies the analyzer has been fed with.
        """
        return json.dumps(self.counter.absolute_counts())

    def store(self, filename):
        """Store the JSON representation of the analyzer to a file."""
        with open(filename, "w") as f:
            f.write(self.serialize())

    def load(self, filename):
        """Load a frequency distribution file and add it to the current one."""
        with open(filename) as f:
            self.counter += NormalizedCounter(json.loads(f.read()))

    def discard(self, chars):
        """Remove the given chars from the counter.

        :param chars: an iterable consisting of the chars whose frequency
            will be set to 0
        """
        for char in chars:
            del self.counter[char]

    def transform_keys(self, transformation):
        """Map keys to other new keys to get a new frequency distribution.

        The relative frequency of keys that map to the same key will be
        added in order to get the new frequency distribution.

        :param transformation: a callable object that maps chars to chars
        """
        self.counter.transform(transformation)

    def keys(self):
        """Return the characters whose frequency is greater than 0."""
        return self.counter.elements()

    @classmethod
    def from_raw_file(cls, filename):
        """Return an analyzer whose distribution is read from a raw file.

        Fix: classmethod first parameter renamed to ``cls`` and used to
        construct the instance, so subclasses get instances of themselves.
        """
        analyzer = cls()
        analyzer.feed_from_raw_file(filename)

        return analyzer

    @classmethod
    def from_file(cls, filename):
        """Read a frequency distribution from a JSON file written by store()."""
        analyzer = cls()
        analyzer.load(filename)
        return analyzer
def iterable_test():
    """Iterating a counter yields keys whose frequencies sum to 1."""
    counter = NormalizedCounter()
    counter.insert("fgaijogarjgaorigjarogijarogiar!)")

    total = sum(counter[key] for key in counter)
    assert total == 1
def iterable_test():
    """A NormalizedCounter is iterable over its keys."""
    nc = NormalizedCounter()
    nc.insert("fgaijogarjgaorigjarogijarogiar!)")

    assert sum(map(nc.__getitem__, nc)) == 1
def most_common_test():
    """most_common() orders (char, frequency) pairs by descending frequency."""
    nc = NormalizedCounter({"a": 8, "b": 4, "c": 2})

    result = nc.most_common()

    assert result == [("a", 8.0 / 14), ("b", 4.0 / 14), ("c", 2.0 / 14)]
Example #21
0
def counter_distance_test():
    """Distance is scale-invariant and commutative."""
    nc1 = NormalizedCounter()
    nc1.insert("a")

    nc2 = NormalizedCounter()
    nc2.insert("b")

    assert counter_distance(nc1, nc2) == 2

    nc1.insert("aaa")  # distance only depends on proportions
    assert counter_distance(nc1, nc2) == 2

    nc1.insert("c")
    assert counter_distance(nc1, nc2) == 1 + 0.8 ** 2 + 0.2 ** 2

    # distance is commutative
    nc1.insert("adairgaoergjaperogianrg")
    nc2.insert("agoaerbpaoibnabnaperioanerpgainergp")

    forward = counter_distance(nc1, nc2)
    backward = counter_distance(nc2, nc1)
    assert forward == backward
def dictionary_constructor_test():
    """A counter built from a dict normalizes the absolute counts."""
    counter = NormalizedCounter({"a": 4, "b": 3})

    assert counter["a"] == 4.0 / 7
    assert counter["b"] == 3.0 / 7
Example #23
0
 def load(self, filename):
     """Load absolute frequencies from a JSON file and merge them in.

     :param filename: path to a JSON file as written by ``store``.
     """
     with open(filename) as handle:
         loaded = NormalizedCounter(json.load(handle))
         self.counter += loaded