def test_Q2_enron1(self):
    # Pipeline under test for the enron1 sample:
    #   sequenceFile -> utf8_decode_and_filter -> extract_email_network
    #   -> convert_to_weighted_network -> pretty_rdd
    # The rendered text is then compared with the recorded expectation.
    raw_records = self.sc.sequenceFile(
        '/user/ufac001/project1920/samples/enron1.seq')
    decoded_records = self.utf8_decode_and_filter(raw_records)
    email_network = extract_email_network(decoded_records)
    weighted_network = convert_to_weighted_network(email_network)
    result = self.pretty_rdd(weighted_network)

    # NOTE(review): the expected text is kept exactly as it appears in this
    # view of the file; its whitespace looks collapsed, so confirm it against
    # the actual output format of pretty_rdd.
    expected = ''' ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) '''
    self.assertEqual(result, expected)
def test_Q4_1(self):
    # Pipeline under test for the enron20 sample:
    #   sequenceFile -> utf8_decode_and_filter -> extract_email_network
    #   -> convert_to_weighted_network -> get_out_degree_dist -> pretty_rdd
    # The rendered text is then compared with the recorded expectation.
    raw_records = self.sc.sequenceFile(
        '/user/ufac001/project1920/samples/enron20.seq')
    decoded_records = self.utf8_decode_and_filter(raw_records)
    email_network = extract_email_network(decoded_records)
    weighted_network = convert_to_weighted_network(email_network)
    out_degree_dist = get_out_degree_dist(weighted_network)
    result = self.pretty_rdd(out_degree_dist)

    # NOTE(review): the expected text is kept exactly as it appears in this
    # view of the file; its whitespace looks collapsed, so confirm it against
    # the actual output format of pretty_rdd.
    expected = ''' (0, 14) (1, 5) (2, 3) (5, 1) (8, 1) (26, 1) '''
    self.assertEqual(result, expected)
def test_Q4_2(self):
    # Pipeline under test for the enron20 sample:
    #   sequenceFile -> utf8_decode_and_filter -> extract_email_network
    #   -> convert_to_weighted_network -> get_in_degree_dist -> pretty_rdd
    # The rendered text is then compared with the recorded expectation.
    raw_records = self.sc.sequenceFile(
        '/user/ufac001/project1920/samples/enron20.seq')
    decoded_records = self.utf8_decode_and_filter(raw_records)
    email_network = extract_email_network(decoded_records)
    weighted_network = convert_to_weighted_network(email_network)
    in_degree_dist = get_in_degree_dist(weighted_network)
    result = self.pretty_rdd(in_degree_dist)

    # NOTE(review): the expected text is kept exactly as it appears in this
    # view of the file; its whitespace looks collapsed, so confirm it against
    # the actual output format of pretty_rdd.
    expected = ''' (0, 8) (1, 10) (2, 1) (3, 2) (4, 2) (9, 1) (15, 1) '''
    self.assertEqual(result, expected)
def test_Q2_enron20(self):
    # Pipeline under test for the enron20 sample:
    #   sequenceFile -> utf8_decode_and_filter -> extract_email_network
    #   -> convert_to_weighted_network(network, (start, end)) -> pretty_rdd
    # The second argument is a pair of UTC datetimes (2000-10-01, 2001-09-01).
    raw_records = self.sc.sequenceFile(
        '/user/ufac001/project1920/samples/enron20.seq')
    decoded_records = self.utf8_decode_and_filter(raw_records)
    email_network = extract_email_network(decoded_records)

    # Built after the network, matching the original argument evaluation order.
    window_start = datetime(2000, 10, 1, tzinfo=timezone.utc)
    window_end = datetime(2001, 9, 1, tzinfo=timezone.utc)
    weighted_network = convert_to_weighted_network(
        email_network, (window_start, window_end))
    result = self.pretty_rdd(weighted_network)

    # NOTE(review): the expected text is kept exactly as it appears in this
    # view of the file; its whitespace looks collapsed, so confirm it against
    # the actual output format of pretty_rdd.
    expected = ''' ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 1) ('*****@*****.**', '*****@*****.**', 2) ('*****@*****.**', '*****@*****.**', 2) ('*****@*****.**', '*****@*****.**', 2) ('*****@*****.**', '*****@*****.**', 2) ('*****@*****.**', '*****@*****.**', 1) '''
    self.assertEqual(result, expected)
def test_Q3_2(self): result = self.pretty_rdd( get_in_degrees( convert_to_weighted_network( extract_email_network( self.utf8_decode_and_filter( self.sc.sequenceFile( '/user/ufac001/project1920/samples/enron20.seq' )))))) self.assertEqual( result
# datetime.datetime(2000, 7, 31, 5, 48, # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200)))) # ('*****@*****.**', '*****@*****.**', # datetime.datetime(2000, 7, 31, 5, 48, # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200)))) # ('*****@*****.**', '*****@*****.**', # datetime.datetime(2000, 7, 31, 5, 48, # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200)))) # ('*****@*****.**', '*****@*****.**', # datetime.datetime(2000, 7, 31, 5, 48, # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200)))) print(pretty_rdd(convert_to_weighted_network( extract_email_network( utf8_decode_and_filter(sc.sequenceFile( 'sample-datasets/enron20.seq'))), (datetime(2000, 10, 1, tzinfo = timezone.utc), datetime(2001, 9, 1, tzinfo = timezone.utc))))) # Expected output: # ('*****@*****.**', '*****@*****.**', 1) # ('*****@*****.**', '*****@*****.**', 1) # ('*****@*****.**', '*****@*****.**', 1) # ('*****@*****.**', '*****@*****.**', 2) # ('*****@*****.**', '*****@*****.**', 2) # ('*****@*****.**', '*****@*****.**', 2) # ('*****@*****.**', '*****@*****.**', 2) # ('*****@*****.**', '*****@*****.**', 1) print(pretty_rdd(convert_to_weighted_network( extract_email_network(