def test_parse_start(self, fake_generic_parser, fake_reddit_parser):
    """
    Test which parser the crawler invokes for each kind of starting URL.

    The two ``fake_*`` arguments are mock objects (presumably injected by
    patch decorators outside this view -- confirm against the class).
    """
    # (start URL, parser mock that must have been called afterwards)
    scenarios = (
        ('https://www.ubc.ca', fake_generic_parser),
        ('https://www.reddit.com', fake_reddit_parser),
    )
    for start_url, expected_parser in scenarios:
        self.spider.parse_start_urls(mock_response(url=start_url))
        self.assertTrue(expected_parser.called)
def test_parse_generic_item(self):
    """
    Test single item parse.

    Parses the saved UBC homepage fixture with the generic parser and checks
    the URL, links, description, titles and the cleaned-up raw content of
    the resulting item.
    """
    response = mock_response('/test_data/ubc.txt', 'http://www.ubc.ca')
    links = ['http://www.google.com', 'http://www.reddit.com']
    item = ScrapyGenericPage(parser.parse_generic_item(response, links))

    self.assertEqual(item['url'], "http://www.ubc.ca")
    # Dedicated comparison asserts report the actual value on failure,
    # unlike assertTrue(len(...) > 0).
    self.assertGreater(len(item['raw_content']), 0)
    self.assertGreater(len(item['links']), 0)
    self.assertEqual(
        item['description'],
        "The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world."
    )
    self.assertEqual(item['links'], links)
    self.assertEqual(item['title'], "Homepage")
    self.assertEqual(item['site_title'], "The University of British Columbia")

    # Check that there are no HTML tags, no blank lines, no JavaScript.
    # The second pattern flags any brace-delimited span without a '*',
    # used here as a rough proxy for leftover script content.
    html_regexp = re.compile(r'<[^>]*?>')
    js_regexp = re.compile(r'{[^*]*?}')
    for line in item['raw_content']:
        self.assertGreater(len(line), 0)
        # search() returns None when nothing matches; assertIsNone shows
        # the offending match object if one is found.
        self.assertIsNone(html_regexp.search(line))
        self.assertIsNone(js_regexp.search(line))
def test_karma_fail(self):
    """
    Test if the parser discards low-karma or no-karma posts.

    Feeds a default mock response (no fixture file, no URL) to ``parse_post``
    with an empty link list and expects a falsy result.
    """
    parsed = parser.parse_post(mock_response(), [])
    self.assertFalse(parsed)
def test_parse_course(self):
    """
    Test courses parsing.

    ``parse_course`` is consumed as an iterable; each yielded object is
    expected to expose ``callback``, ``meta`` and ``priority`` (presumably
    Scrapy requests -- confirm against the parser).
    """
    response = mock_response(
        '/test_data/courses.txt',
        'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ASTR'
    )
    # Subject data attached to the response; it is expected to be carried
    # over into the ``subject`` field of each course.
    response.meta['data'] = {"url":"some_url"}
    output = list(parser.parse_course(response))

    expected_courses = [
        ScrapyCourseItem(
            subject={"url":"some_url"},
            url="https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=GRSJ&course=101",
            name="GRSJ 101 Introduction to Social Justice"
        ),
        ScrapyCourseItem(
            subject={"url":"some_url"},
            url="https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=GRSJ&course=102",
            name="GRSJ 102 Global Issues in Social Justice"
        )
    ]

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(output[0].callback.__name__, parser.parse_course_details.__name__)
    # Only the URL is compared for the first course; the second one is
    # compared as a whole item.
    self.assertEqual(output[0].meta['data']['url'], expected_courses[0]['url'])
    self.assertEqual(output[0].priority, 100)
    self.assertEqual(output[1].meta['data'], expected_courses[1])
def test_title_parsing(self):
    """
    Test how site_title is assembled from the page's <title> element.

    Each case pairs raw title markup with the ``site_title`` the generic
    parser is expected to derive from it.
    """
    cases = (
        (
            '<title>Homepage - Subtitle 1 - Subtitle 2 - The University of British Columbia</title>',
            "Subtitle 2 - The University of British Columbia",
        ),
        (
            '<title>Engineering alumna gives back as a WiSE Mentor | Women in Science and Engineering</title>',
            "Women in Science and Engineering",
        ),
    )
    for markup, expected_site_title in cases:
        page = ScrapyGenericPage(
            parser.parse_generic_item(mock_response(markup), [])
        )
        self.assertEqual(page['site_title'], expected_site_title)
def test_parse_reddit_post(self, fake_parser):
    """
    Test crawler's redirect to reddit_parser.

    ``fake_parser`` is a mock standing in for the reddit parser (presumably
    injected by a patch decorator outside this view -- confirm).
    """
    response = mock_response(file_name='/test_data/reddit_text_post.txt')
    self.spider.parse_reddit_post(response)

    self.assertTrue(fake_parser.called)
    # call_args[0] is the tuple of positional arguments of the last call;
    # its second element is the link collection handed to the parser.
    links_arg = fake_parser.call_args[0][1]
    # assertGreater reports the actual length on failure, unlike
    # assertTrue(len(...) > 0).
    self.assertGreater(len(links_arg), 0)
def test_parse_generic_item(self, fake_parser):
    """
    Test crawler's redirect to generic_page_parser as default parser.

    ``fake_parser`` is a mock standing in for the generic parser (presumably
    injected by a patch decorator outside this view -- confirm).
    """
    response = mock_response(file_name='/test_data/ubc.txt')
    self.spider.parse_generic_item(response)

    self.assertTrue(fake_parser.called)
    # call_args[0] is the tuple of positional arguments of the last call;
    # its second element is the link collection handed to the parser.
    links_arg = fake_parser.call_args[0][1]
    # Dedicated asserts show the actual length / container on failure.
    self.assertGreater(len(links_arg), 0)
    # NOTE(review): presumably this is the page's own URL, which should be
    # excluded from its extracted links -- confirm against mock_response.
    self.assertNotIn('http://www.ubc.ca', links_arg)
def test_parse_course_details(self):
    """
    Test course details parsing.

    Starts from a course item with empty fields attached to the response
    and expects ``parse_course_details`` to return that item with the
    description filled in from the fixture page.
    """
    response = mock_response(
        '/test_data/course_details.txt',
        'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=ASTR&course=200'
    )
    response.meta['data'] = ScrapyCourseItem(subject="", url="", name="")
    output = parser.parse_course_details(response)

    expected_course = ScrapyCourseItem(
        subject="",
        url="",
        name="",
        description="An overview of intersectional feminist debates and theoretical traditions. Credit will be granted for only one of WMST 100 or GRSJ 101."
    )
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(output, expected_course)
def test_opengraph_metadata_use(self):
    """
    Test how OpenGraph metadata is used.

    The expected title, site title and description all carry an "OG "
    prefix, which presumably marks the OpenGraph values inside the fixture
    (confirm against /test_data/metadata.txt).
    """
    og_response = mock_response('/test_data/metadata.txt', 'https://www.ubyssey.ca/')
    parsed = parser.parse_generic_item(og_response, [])
    page = ScrapyGenericPage(parsed)

    self.assertEqual(
        page['title'],
        "OG The Ubyssey - UBC's official student newspaper"
    )
    self.assertEqual(page['site_title'], 'OG The Ubyssey')
    self.assertEqual(
        page['description'],
        'OG Weekly student newspaper of the University of British Columbia.'
    )
def test_parse_text_post(self):
    """
    Test parsing a reddit text post.

    Parses the saved text-post fixture and checks the subreddit, the post
    title and the text of the first comment.
    """
    post_response = mock_response('/test_data/reddit_text_post.txt', 'http://www.reddit.com/')
    outgoing_links = ['http://www.google.com', 'http://www.reddit.com']
    post = ScrapyRedditPost(parser.parse_post(post_response, outgoing_links))

    expected_title = "As a first year student it's really hard to get into the UBC discord"
    expected_first_comment = "Don't worry, it feels like that for everyone.At some point, the UBC discord became it's own little circle-jerk of friends, exclusive to anyone else. There are about 8-10 regular users, who communicate mainly through inside jokes and 4chan-esque internet humor. You're better off without them, I guarantee."

    self.assertEqual('UBC', post['subreddit'])
    self.assertEqual(expected_title, post['title'])
    self.assertEqual(expected_first_comment, post['comments'][0])
def test_parse_subjects(self):
    """
    Test subjects parsing.

    ``parse_subjects`` is consumed as an iterable; each yielded object is
    expected to expose ``callback``, ``meta`` and ``priority`` (presumably
    Scrapy requests -- confirm against the parser).
    """
    response = mock_response(
        '/test_data/subjects.txt',
        'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0'
    )
    output = list(parser.parse_subjects(response))

    expected_subjects = [
        {
            "url": "https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=AANB",
            "name": "AANB Applied Animal Biology",
            "faculty": "Faculty of Land and Food Systems"
        },
        {
            "url": "https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ACAM",
            "name": "ACAM Asian Canadian and Asian Migration Studies",
            "faculty": "Faculty of Arts"
        }
    ]

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(output[0].callback.__name__, parser.parse_course.__name__)
    self.assertEqual(output[0].meta['data'], expected_subjects[0])
    self.assertEqual(output[0].priority, 100)
    self.assertEqual(output[1].meta['data'], expected_subjects[1])
def test_no_parse(self):
    """
    Smoke-test the spider's ``no_parse`` callback.

    There is no assertion: the test only fails if ``no_parse`` raises when
    given the reddit text-post fixture response.
    """
    self.spider.no_parse(
        mock_response(file_name='/test_data/reddit_text_post.txt')
    )