Example #1
0
 def test_parse_start(self, fake_generic_parser, fake_reddit_parser):
     '''
     Check which parser mock has been hit after the spider handles a start URL.

     A ubc.ca response must leave the generic parser mock called; a reddit.com
     response must leave the reddit parser mock called. The two mocks are
     presumably injected by patch decorators that are not visible here.
     '''
     scenarios = (
         ('https://www.ubc.ca', fake_generic_parser),
         ('https://www.reddit.com', fake_reddit_parser),
     )
     for start_url, parser_mock in scenarios:
         response = mock_response(url=start_url)
         self.spider.parse_start_urls(response)
         self.assertTrue(parser_mock.called)
    def test_parse_generic_item(self):
        """
        Parse the saved UBC homepage fixture and verify every extracted field.

        Covers url, description, links, title and site_title, and checks that
        each entry of raw_content is non-empty and free of HTML tags and
        brace-delimited (JavaScript-looking) fragments.
        """
        outgoing_links = ['http://www.google.com', 'http://www.reddit.com']
        page = ScrapyGenericPage(
            parser.parse_generic_item(
                mock_response('/test_data/ubc.txt', 'http://www.ubc.ca'),
                outgoing_links,
            )
        )

        self.assertEqual(page['url'], "http://www.ubc.ca")
        self.assertTrue(len(page['raw_content']) > 0)
        self.assertTrue(len(page['links']) > 0)
        expected_description = "The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world."
        self.assertEqual(page['description'], expected_description)
        self.assertEqual(page['links'], outgoing_links)
        self.assertEqual(page['title'], "Homepage")
        self.assertEqual(page['site_title'], "The University of British Columbia")

        # Check that there are no HTML tags, no blank lines, no JavaScript
        tag_pattern = re.compile(r'<[^>]*?>')
        script_pattern = re.compile(r'{[^*]*?}')
        for content_line in page['raw_content']:
            self.assertTrue(len(content_line) > 0)
            self.assertFalse(tag_pattern.search(content_line))
            self.assertFalse(script_pattern.search(content_line))
Example #3
0
 def test_karma_fail(self):
     '''
     Low-karma or no-karma posts must be discarded: parse_post on a default
     mock response with no links is expected to return something falsy.
     '''
     parsed = parser.parse_post(mock_response(), [])
     self.assertFalse(parsed)
 def test_parse_course(self):
     '''
     Test courses parsing.

     Feeds the saved course-listing fixture to parser.parse_course and checks
     the first two yielded objects: the callback and priority of the first,
     and the course data carried in meta['data'] of both.
     '''
     response = mock_response(
         '/test_data/courses.txt',
         'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ASTR'
     )
     # Subject data handed to the parser; the expected items below carry it
     # as their 'subject' field.
     response.meta['data'] = {"url":"some_url"}
     output = list(parser.parse_course(response))
     expected_courses = [
         ScrapyCourseItem(
             subject={"url":"some_url"},
             url="https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=GRSJ&course=101",
             name="GRSJ 101 Introduction to Social Justice"
         ),
         ScrapyCourseItem(
             subject={"url":"some_url"},
             url="https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=GRSJ&course=102",
             name="GRSJ 102 Global Issues in Social Justice"
         )
     ]
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which was removed from unittest in Python 3.12.
     self.assertEqual(output[0].callback.__name__, parser.parse_course_details.__name__)
     # Only the URL of the first course is compared; the second is compared whole.
     self.assertEqual(output[0].meta['data']['url'], expected_courses[0]['url'])
     self.assertEqual(output[0].priority, 100)
     self.assertEqual(output[1].meta['data'], expected_courses[1])
    def test_title_parsing(self):
        """
        Verify how site_title is derived from the page's <title> element,
        for both a dash-separated and a pipe-separated title.
        """
        cases = (
            (
                '<title>Homepage - Subtitle 1 - Subtitle 2 - The University of British Columbia</title>',
                "Subtitle 2 - The University of British Columbia",
            ),
            (
                '<title>Engineering alumna gives back as a WiSE Mentor | Women in Science and Engineering</title>',
                "Women in Science and Engineering",
            ),
        )
        for markup, expected_site_title in cases:
            response = mock_response(markup)
            item = ScrapyGenericPage(parser.parse_generic_item(response, []))
            self.assertEqual(item['site_title'], expected_site_title)
Example #6
0
 def test_parse_reddit_post(self, fake_parser):
     '''
     The spider must hand a reddit post over to the (mocked) reddit parser,
     passing a non-empty second positional argument (the links).
     '''
     self.spider.parse_reddit_post(
         mock_response(file_name='/test_data/reddit_text_post.txt')
     )
     self.assertTrue(fake_parser.called)

     # call_args[0] holds the positional arguments of the last call
     positional_args = fake_parser.call_args[0]
     forwarded_links = positional_args[1]
     self.assertTrue(len(forwarded_links)>0)
Example #7
0
 def test_parse_generic_item(self, fake_parser):
     '''
     The spider must delegate to the (mocked) generic page parser, passing a
     non-empty second positional argument (the links) that does not contain
     'http://www.ubc.ca'.
     '''
     self.spider.parse_generic_item(
         mock_response(file_name='/test_data/ubc.txt')
     )
     self.assertTrue(fake_parser.called)

     # call_args[0] holds the positional arguments of the last call
     positional_args = fake_parser.call_args[0]
     forwarded_links = positional_args[1]
     self.assertTrue(len(forwarded_links)>0)
     self.assertFalse('http://www.ubc.ca' in forwarded_links)
 def test_parse_course_details(self):
     '''
     Test course details parsing.

     Feeds the saved course-details fixture to parser.parse_course_details
     with an otherwise empty course item in meta['data'], and expects that
     item back with its description filled in.
     '''
     response = mock_response(
         '/test_data/course_details.txt',
         'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=ASTR&course=200'
     )
     response.meta['data'] = ScrapyCourseItem(subject="",url="",name="")
     output = parser.parse_course_details(response)
     expected_course = ScrapyCourseItem(
         subject="", url="", name="",
         description="An overview of intersectional feminist debates and theoretical traditions. Credit will be granted for only one of WMST 100 or GRSJ 101."
     )
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which was removed from unittest in Python 3.12.
     self.assertEqual(output, expected_course)
 def test_opengraph_metadata_use(self):
     '''
     Parse the metadata fixture and check that title, site_title and
     description take the "OG ..." values (presumably the page's OpenGraph
     tags in the fixture).
     '''
     response = mock_response('/test_data/metadata.txt', 'https://www.ubyssey.ca/')
     parsed = parser.parse_generic_item(response, [])
     page = ScrapyGenericPage(parsed)

     expected_fields = (
         ('title', "OG The Ubyssey - UBC's official student newspaper"),
         ('site_title', 'OG The Ubyssey'),
         ('description',
          'OG Weekly student newspaper of the University of British Columbia.'),
     )
     for field_name, expected_value in expected_fields:
         self.assertEqual(page[field_name], expected_value)
Example #10
0
 def test_parse_text_post(self):
     '''
     Parse the saved reddit text-post fixture and verify the subreddit, the
     post title and the text of the first comment.
     '''
     expected_title = "As a first year student it's really hard to get into the UBC discord"
     expected_first_comment = "Don't worry, it feels like that for everyone.At some point, the UBC discord became it's own little circle-jerk of friends, exclusive to anyone else. There are about 8-10 regular users, who communicate mainly through inside jokes and 4chan-esque internet humor. You're better off without them, I guarantee."

     post = ScrapyRedditPost(
         parser.parse_post(
             mock_response('/test_data/reddit_text_post.txt', 'http://www.reddit.com/'),
             ['http://www.google.com', 'http://www.reddit.com'],
         )
     )

     self.assertEqual('UBC', post['subreddit'])
     self.assertEqual(expected_title, post['title'])
     self.assertEqual(expected_first_comment, post['comments'][0])
Example #11
0
 def test_parse_subjects(self):
     '''
     Test subjects parsing.

     Feeds the saved subject-listing fixture to parser.parse_subjects and
     checks the first two yielded objects: the callback and priority of the
     first, and the subject data carried in meta['data'] of both.
     '''
     response = mock_response(
         '/test_data/subjects.txt',
         'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0'
     )
     output = list(parser.parse_subjects(response))
     expected_subjects = [
         {
             "url": "https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=AANB",
             "name": "AANB Applied Animal Biology",
             "faculty": "Faculty of Land and Food Systems"
         },
         {
             "url": "https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ACAM",
             "name": "ACAM Asian Canadian and Asian Migration Studies",
             "faculty": "Faculty of Arts"
         }
     ]
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which was removed from unittest in Python 3.12.
     self.assertEqual(output[0].callback.__name__, parser.parse_course.__name__)
     self.assertEqual(output[0].meta['data'], expected_subjects[0])
     self.assertEqual(output[0].priority, 100)
     self.assertEqual(output[1].meta['data'], expected_subjects[1])
Example #12
0
 def test_no_parse(self):
     '''
     Smoke test: calling the spider's no_parse on a reddit post fixture must
     complete without raising (no result is asserted).
     '''
     self.spider.no_parse(
         mock_response(file_name='/test_data/reddit_text_post.txt')
     )