def test_ratio(self):
    """
    Check ratio() against values counted by hand from a fixed sample
    """
    # The fixture string contains seven 'a' characters and six 'b'
    # characters, so the expected values are 7/6 and 6/7.
    counts = FreqDist('aaabbbaaabccddeeffbbccddeegjja')

    a_over_b = counts.ratio('a', 'b')
    self.assertAlmostEqual(a_over_b, 1.16666667)

    b_over_a = counts.ratio('b', 'a')
    self.assertAlmostEqual(b_over_a, 0.85714285)
def extract_graph(self):
    """
    Builds a networkx Graph linking the addresses that appear together on
    an email; nodes are the `email` values of the participants.
    """
    pair_counts = FreqDist()

    for email in self.extract():
        # Everyone involved in the message: sender, recipients and copied
        participants = [email.sender]
        participants += email.recipients
        participants += email.copied

        # Drop missing participants and empty addresses, keeping uniques
        addresses = {
            person.email
            for person in participants
            if person is not None and person.email
        }

        # Sorting first means a given pair is always counted under one key
        for pair in combinations(sorted(addresses), 2):
            pair_counts[pair] += 1

    graph = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow())

    # One edge per observed pair, weighted by the pair's freq() value
    for pair in pair_counts.keys():
        source, target = pair
        graph.add_edge(source, target, weight=pair_counts.freq(pair))

    return graph
def test_str(self):
    """
    Test the stringification of the frequency distribution

    The test passes when str(), repr() and pprint() all complete without
    raising; their return values are not inspected.
    """
    # Build the fixture outside the try block so that a constructor error is
    # reported as itself rather than as a stringification failure.
    dist = FreqDist(random_characters(1000, 'abc'))

    try:
        str(dist)
        repr(dist)
        dist.pprint()
    except Exception as e:
        # Any exception here is converted to an explicit test failure.
        self.fail("Stringification failed: {}".format(e))
def test_inverse_ratio(self):
    """
    Check that ratio(a, b) and ratio(b, a) are reciprocals of each other
    """
    dist = FreqDist(random_characters(1000, 'abc'))

    a_to_b = dist.ratio('a', 'b')
    b_to_a = dist.ratio('b', 'a')

    # Compute both reciprocals before asserting anything
    inverse_a_to_b, inverse_b_to_a = 1.0 / a_to_b, 1.0 / b_to_a

    self.assertAlmostEqual(inverse_a_to_b, b_to_a)
    self.assertAlmostEqual(inverse_b_to_a, a_to_b)
def test_freq(self):
    """
    Check freq() on a sample where one element makes up a known share
    """
    # 90 random draws from 'abc' followed by exactly ten 'd' samples, so
    # 'd' accounts for 10 of the 100 samples.
    samples = list(random_characters(90, 'abc')) + ['d'] * 10
    dist = FreqDist(samples)

    self.assertEqual(dist.N, len(samples))
    self.assertEqual(dist.B, 4)
    self.assertAlmostEqual(dist.freq('d'), 0.1)

    # The remaining letters must fall strictly between 0 and 1
    for letter in 'abc':
        self.assertGreater(dist.freq(letter), 0.0)
        self.assertLess(dist.freq(letter), 1.0)
def test_dump_and_load(self):
    """
    Check that a distribution survives a dump/load round trip
    """
    stream = StringIO()
    original = FreqDist(random_characters(1000))

    # Write the frequency distribution into the in-memory stream
    original.dump(stream)

    # Rewind so that load() reads from the beginning of what was written
    stream.seek(0)
    restored = FreqDist.load(stream)

    self.assertEqual(original, restored)
def header_analysis(self):
    """
    Counts how often each header key occurs across the messages in self.

    Returns a FreqDist of header keys; the extra 'X-Tribe-Message-Count'
    entry is incremented once per message.
    """
    header_counts = FreqDist()

    for message in self:
        # Synthetic key that tallies the number of messages seen
        header_counts['X-Tribe-Message-Count'] += 1

        for header_name in message.keys():
            header_counts[header_name] += 1

    return header_counts
def test_memoized_n_samples(self):
    """
    Check that N is cached until the cached value is deleted
    """
    dist = FreqDist(random_characters(100))
    self.assertEqual(dist.N, 100)

    # Add another 100 samples after N has already been read once
    for char in random_characters(100):
        dist[char] += 1

    # The earlier value is still reported
    self.assertEqual(dist.N, 100)

    # Deleting the attribute makes the next read reflect all 200 samples
    del dist.N
    self.assertEqual(dist.N, 200)
def test_memoized_b_bins(self):
    """
    Check that B is cached until the cached value is deleted
    """
    dist = FreqDist(random_characters(1000))
    self.assertEqual(dist.B, 26)

    # NOTE(review): the expected jump from 26 to 32 bins implies that the
    # default alphabet of random_characters does not include 'a'-'f' and
    # that all six letters get drawn here -- confirm against the helper.
    for char in random_characters(100, 'abcdef'):
        dist[char] += 1

    # The earlier value is still reported
    self.assertEqual(dist.B, 26)

    # Deleting the attribute makes the next read include the new bins
    del dist.B
    self.assertEqual(dist.B, 32)
def test_memoized_m_magnitude(self):
    """
    Check that M is cached until the cached value is deleted
    """
    # 'a' is the most common character in the fixture, appearing 7 times
    dist = FreqDist('aaabbbaaabccddeeffbbccddeegjja')
    self.assertEqual(dist.M, 7)

    # Three more 'a' samples push its count to 10
    for char in 'aaabbccc':
        dist[char] += 1

    # The earlier value is still reported
    self.assertEqual(dist.M, 7)

    # Deleting the attribute makes the next read reflect the new maximum
    del dist.M
    self.assertEqual(dist.M, 10)
def test_norm(self):
    """
    Check norm() on a sample with a known most-common element
    """
    # 50 random draws from 'abc' followed by exactly fifty 'd' samples
    samples = list(random_characters(50, 'abc')) + ['d'] * 50
    dist = FreqDist(samples)

    self.assertEqual(dist.max(), 'd')
    self.assertEqual(dist.N, len(samples))
    self.assertEqual(dist.M, 50)
    self.assertAlmostEqual(dist.norm('d'), 1.0)

    # The remaining letters must fall strictly between 0 and 1
    for letter in 'abc':
        self.assertGreater(dist.norm(letter), 0.0)
        self.assertLess(dist.norm(letter), 1.0)
def test_m_magnitude(self):
    """
    Check M on a fixed sample whose top count is known
    """
    # 'a' is the most common character in the fixture, appearing 7 times
    fixture = 'aaabbbaaabccddeeffbbccddeegjja'
    self.assertEqual(FreqDist(fixture).M, 7)
def __init__(self, path):
    """
    Stores the path, opens it with mbox() and starts an empty error tally
    """
    # Counts the errors encountered during the extraction process
    self.errors = FreqDist()

    self.path = path
    self.mbox = mbox(path)
def test_max(self):
    """
    Check that max() picks the most common element
    """
    # 'a' appears 7 times in the fixture, more than any other character
    fixture = 'aaabbbaaabccddeeffbbccddeegjja'
    self.assertEqual(FreqDist(fixture).max(), 'a')
def test_empty_max(self):
    """
    Check that max() of an empty distribution is None
    """
    empty = FreqDist()
    self.assertIsNone(empty.max())
def extract_graph(self):
    """
    Builds a networkx Graph linking the addresses that appear together on
    an email; nodes are the `email` values of the participants.

    Emails whose relationships cannot be computed are tallied in
    self.errors and skipped.
    """

    def relationships(email):
        """
        Returns an iterator over every pair of unique addresses on an email
        """
        # Everyone involved in the message: sender, recipients and copied
        participants = [email.sender]
        participants += email.recipients
        participants += email.copied

        # Drop missing participants and empty addresses, keeping uniques
        addresses = {
            person.email
            for person in participants
            if person is not None and person.email
        }

        # Sorting first means a given pair is always produced in one order
        return combinations(sorted(addresses), 2)

    # Number of times each address pair has been seen together
    pair_counts = FreqDist()
    n_emails = 0

    # Walk the extracted emails one at a time; a failure on one email is
    # recorded and the loop moves on to the next.
    # NOTE: the pair counts are accumulated in memory.
    for n_emails, email in enumerate(self.extract(), 1):
        try:
            for pair in relationships(email):
                pair_counts[pair] += 1
        except Exception as e:
            # NOTE(review): the tally is keyed by the exception instance,
            # so separate failures land under separate keys -- confirm that
            # this is the intended grouping.
            self.errors[e] += 1

    # Graph-level attributes describe how and from what it was generated
    graph = nx.Graph(
        name="Email Network",
        mbox=self.path,
        extracted=strfnow(),
        n_emails=n_emails,
        mbox_size=filesize(self.path),
    )

    # One edge per observed pair, carrying the freq(), raw count and norm()
    # values that the FreqDist reports for that pair.
    for pair in pair_counts.keys():
        source, target = pair
        graph.add_edge(
            source,
            target,
            weight=pair_counts.freq(pair),
            count=pair_counts[pair],
            norm=pair_counts.norm(pair),
        )

    return graph
def test_b_bins(self):
    """
    Check B, the number of bins, on a large random sample
    """
    # 1000 draws are expected to produce 26 distinct bins
    sample = random_characters(1000)
    self.assertEqual(FreqDist(sample).B, 26)
def test_empty_freq(self):
    """
    Check that freq() on an empty distribution is 0
    """
    empty = FreqDist()
    self.assertEqual(empty.freq('a'), 0)
def test_n_samples(self):
    """
    Check N, the number of samples, against the size of the input
    """
    sample = random_characters(100)
    self.assertEqual(FreqDist(sample).N, 100)
def test_empty_norm(self):
    """
    Check that norm() on an empty distribution is 0
    """
    empty = FreqDist()
    self.assertEqual(empty.norm('a'), 0)
def test_missing_ratio(self):
    """
    Check that ratio() against an element that was never seen is 0
    """
    # Only 'a', 'b' and 'c' are sampled, so 'd' has no observations
    dist = FreqDist(random_characters(100, 'abc'))
    unseen_ratio = dist.ratio('a', 'd')
    self.assertEqual(unseen_ratio, 0)