Example #1
0
    def test_Q1(self):
        # Q1: build the email network from the enron1 sample and compare its
        # pretty-printed form against the expected text below.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron1.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        network = extract_email_network(messages)
        result = self.pretty_rdd(network)

        # NOTE: the whitespace inside this literal (including trailing
        # spaces) is part of the compared value; do not reformat it.
        expected = '''
            ('*****@*****.**', '*****@*****.**',
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            ('*****@*****.**', '*****@*****.**',
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            ('*****@*****.**', '*****@*****.**', 
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            ('*****@*****.**', '*****@*****.**', 
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            ('*****@*****.**', '*****@*****.**', 
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            ('*****@*****.**', '*****@*****.**', 
            datetime.datetime(2000, 7, 31, 5, 48, 
            tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
            '''
        self.assertEqual(result, expected)
Example #2
0
    def test_Q2_enron1(self):
        # Q2: convert the enron1 email network to its weighted form and
        # compare the pretty-printed result with the expected text.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron1.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        network = extract_email_network(messages)
        weighted = convert_to_weighted_network(network)
        result = self.pretty_rdd(weighted)

        # The literal's leading whitespace is part of the compared value.
        expected = '''
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        '''
        self.assertEqual(result, expected)
Example #3
0
    def test_Q4_1(self):
        # Q4 (part 1): compute the out-degree distribution of the weighted
        # enron20 network and compare its pretty-printed form.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        weighted = convert_to_weighted_network(
            extract_email_network(messages))
        distribution = get_out_degree_dist(weighted)
        result = self.pretty_rdd(distribution)

        # The literal's leading whitespace is part of the compared value.
        expected = '''
        (0, 14)
        (1, 5)
        (2, 3)
        (5, 1)
        (8, 1)
        (26, 1)
        '''
        self.assertEqual(result, expected)
Example #4
0
    def test_Q4_2(self):
        # Q4 (part 2): compute the in-degree distribution of the weighted
        # enron20 network and compare its pretty-printed form.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        weighted = convert_to_weighted_network(
            extract_email_network(messages))
        distribution = get_in_degree_dist(weighted)
        result = self.pretty_rdd(distribution)

        # The literal's leading whitespace is part of the compared value.
        expected = '''
        (0, 8)
        (1, 10)
        (2, 1)
        (3, 2)
        (4, 2)
        (9, 1)
        (15, 1)
        '''
        self.assertEqual(result, expected)
Example #5
0
    def test_Q2_enron20(self):
        # Q2 on enron20: convert the email network to its weighted form,
        # passing a (start, end) pair of UTC datetimes as second argument,
        # and compare the pretty-printed result with the expected text.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        network = extract_email_network(messages)
        window = (
            datetime(2000, 10, 1, tzinfo=timezone.utc),
            datetime(2001, 9, 1, tzinfo=timezone.utc),
        )
        weighted = convert_to_weighted_network(network, window)
        result = self.pretty_rdd(weighted)

        # The literal's leading whitespace is part of the compared value.
        expected = '''
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 1)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 2)
        ('*****@*****.**', '*****@*****.**', 1)
        '''
        self.assertEqual(result, expected)
Example #6
0
    def test_Q3_2(self):
        # Q3 (part 2): compute the in-degrees of the weighted enron20
        # network and compare the pretty-printed (degree, address) pairs.
        raw_records = self.sc.sequenceFile(
            '/user/ufac001/project1920/samples/enron20.seq')
        messages = self.utf8_decode_and_filter(raw_records)
        weighted = convert_to_weighted_network(
            extract_email_network(messages))
        in_degrees = get_in_degrees(weighted)
        result = self.pretty_rdd(in_degrees)

        # The literal's leading whitespace is part of the compared value.
        expected = '''
        (15, '*****@*****.**')
        (9, '*****@*****.**')
        (4, '*****@*****.**')
        (4, '*****@*****.**')
        (3, '*****@*****.**')
        (3, '*****@*****.**')
        (2, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (1, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        (0, '*****@*****.**')
        '''
        self.assertEqual(result, expected)
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
# ('*****@*****.**', '*****@*****.**', 
# datetime.datetime(2000, 7, 31, 5, 48, 
# tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    

    # Print the weighted enron20 network, passing a (start, end) pair of
    # UTC datetimes as the second argument of the conversion.
    q2_messages = utf8_decode_and_filter(
        sc.sequenceFile('sample-datasets/enron20.seq'))
    q2_network = extract_email_network(q2_messages)
    q2_window = (
        datetime(2000, 10, 1, tzinfo=timezone.utc),
        datetime(2001, 9, 1, tzinfo=timezone.utc),
    )
    print(pretty_rdd(convert_to_weighted_network(q2_network, q2_window)))
# Expected output:
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 1)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 2)
# ('*****@*****.**', '*****@*****.**', 1)

    
    print(pretty_rdd(convert_to_weighted_network(
Example #8
0
    def utf_decode(s):
        """Decode ``s`` as UTF-8 text, returning ``None`` when that fails.

        Args:
            s: the value to decode; decoding succeeds only for bytes-like
                objects holding valid UTF-8.

        Returns:
            The decoded ``str``, or ``None`` if ``s`` is not valid UTF-8 or
            is not a bytes-like object.
        """
        try:
            return str(s, 'utf-8')
        except (UnicodeDecodeError, TypeError):
            # Best-effort decoding: undecodable or non-bytes input is
            # reported as None instead of raising. Only decode failures are
            # caught, so KeyboardInterrupt/SystemExit still propagate.
            return None

    return rdd.map(lambda x: utf_decode(x[1])).filter(lambda x: x != None)


if __name__ == '__main__':

    # Q1 test: decode the enron1 sample, extract the email network and
    # print its pretty-printed form.
    q1_messages = utf8_decode_and_filter(
        sc.sequenceFile('/user/ufac001/project1920/samples/enron1.seq'))
    q1_network = extract_email_network(q1_messages)
    print(pretty_rdd(q1_network))
    # Expected output:
    # ('*****@*****.**', '*****@*****.**',
    # datetime.datetime(2000, 7, 31, 5, 48,
    # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    # ('*****@*****.**', '*****@*****.**',
    # datetime.datetime(2000, 7, 31, 5, 48,
    # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    # ('*****@*****.**', '*****@*****.**',
    # datetime.datetime(2000, 7, 31, 5, 48,
    # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    # ('*****@*****.**', '*****@*****.**',
    # datetime.datetime(2000, 7, 31, 5, 48,
    # tzinfo=datetime.timezone(datetime.timedelta(-1, 61200))))
    # ('*****@*****.**', '*****@*****.**',