Esempio n. 1
0
    def test_Q2_enron1(self):
        # Run the enron1 sample through the pipeline one stage at a time:
        # sequenceFile -> utf8_decode_and_filter -> extract_email_network
        # -> convert_to_weighted_network, then render it with pretty_rdd.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron1.seq')
        decoded_records = self.utf8_decode_and_filter(raw_records)
        email_network = extract_email_network(decoded_records)
        weighted_network = convert_to_weighted_network(email_network)
        result = self.pretty_rdd(weighted_network)

        # The rendered output must match this text exactly, including the
        # leading newline and the indentation on every line.
        expected = '''
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        '''
        self.assertEqual(result, expected)
Esempio n. 2
0
    def test_Q4_1(self):
        # Build the weighted network for the enron20 sample stage by stage,
        # then pass it to get_out_degree_dist and render with pretty_rdd.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        decoded_records = self.utf8_decode_and_filter(raw_records)
        email_network = extract_email_network(decoded_records)
        weighted_network = convert_to_weighted_network(email_network)
        out_degree_dist = get_out_degree_dist(weighted_network)
        result = self.pretty_rdd(out_degree_dist)

        # Expected rendering of the out-degree distribution, compared
        # verbatim (whitespace included).
        expected = '''
        (0, 14)
        (1, 5)
        (2, 3)
        (5, 1)
        (8, 1)
        (26, 1)
        '''
        self.assertEqual(result, expected)
Esempio n. 3
0
    def test_Q4_2(self):
        # Build the weighted network for the enron20 sample stage by stage,
        # then pass it to get_in_degree_dist and render with pretty_rdd.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        decoded_records = self.utf8_decode_and_filter(raw_records)
        email_network = extract_email_network(decoded_records)
        weighted_network = convert_to_weighted_network(email_network)
        in_degree_dist = get_in_degree_dist(weighted_network)
        result = self.pretty_rdd(in_degree_dist)

        # Expected rendering of the in-degree distribution, compared
        # verbatim (whitespace included).
        expected = '''
        (0, 8)
        (1, 10)
        (2, 1)
        (3, 2)
        (4, 2)
        (9, 1)
        (15, 1)
        '''
        self.assertEqual(result, expected)
Esempio n. 4
0
    def test_Q2_enron20(self):
        # Extract the email network from the enron20 sample first, so the
        # evaluation order matches a single nested call expression.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        decoded_records = self.utf8_decode_and_filter(raw_records)
        email_network = extract_email_network(decoded_records)

        # Second positional argument to convert_to_weighted_network: a pair
        # of timezone-aware UTC datetimes (2000-10-01 and 2001-09-01).
        # NOTE(review): presumably the bounds of a time window for the
        # emails that are counted -- confirm against the helper.
        date_bounds = (
            datetime(2000, 10, 1, tzinfo=timezone.utc),
            datetime(2001, 9, 1, tzinfo=timezone.utc),
        )
        weighted_network = convert_to_weighted_network(
            email_network, date_bounds)
        result = self.pretty_rdd(weighted_network)

        # Expected rendering, compared verbatim (whitespace included).
        expected = '''
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 1)
        '''
        self.assertEqual(result, expected)
Esempio n. 5
0
    def test_Q3_2(self):
        # Build the weighted network for the enron20 sample stage by stage,
        # then pass it to get_in_degrees and render with pretty_rdd.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        decoded_records = self.utf8_decode_and_filter(raw_records)
        email_network = extract_email_network(decoded_records)
        weighted_network = convert_to_weighted_network(email_network)
        in_degrees = get_in_degrees(weighted_network)
        result = self.pretty_rdd(in_degrees)

        # Expected rendering: one (degree, address) pair per line, listed
        # here from the largest degree down to zero; compared verbatim.
        expected = '''
        (15, '*****@*****.**')
        (9, '*****@*****.**')
        (4, '*****@*****.**')
        (4, '*****@*****.**')
        (3, '*****@*****.**')
        (3, '*****@*****.**')
        (2, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        '''
        self.assertEqual(result, expected)
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    

    # Print the pretty_rdd rendering of the weighted network built from the
    # local 'sample-datasets/enron20.seq' sample. The second argument to
    # convert_to_weighted_network is a pair of UTC datetimes (2000-10-01 and
    # 2001-09-01).
    # NOTE(review): presumably the bounds of a time window for the emails
    # that are counted -- confirm against the helper's definition.
    print(pretty_rdd(convert_to_weighted_network(
        extract_email_network(
        utf8_decode_and_filter(sc.sequenceFile(
                'sample-datasets/enron20.seq'))), 
                (datetime(2000, 10, 1, tzinfo = timezone.utc), 
                 datetime(2001, 9, 1, tzinfo = timezone.utc)))))
# Expected output:
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 1)

    
    print(pretty_rdd(convert_to_weighted_network(
        extract_email_network(