def test_handleHTML_regex_unit_code_test3(self):
    # "year 2015" is not a unit code, so the unit regex should not match it
    parser = handleHTML.parse()
    result = parser.unit_regex.match("year 2015")
    self.assertIsNone(result)

def test_handleHTML_regex_unit_code_test2(self):
    # "pys101" should be recognised as a unit code
    parser = handleHTML.parse()
    result = parser.unit_regex.match("pys101")
    self.assertIsNotNone(result)

def test_handleHTML_regex_heading_tag(self):
    # "h5" should be recognised as a heading tag name
    parser = handleHTML.parse()
    result = parser.heading_tag_regex.match("h5")
    self.assertIsNotNone(result)

def test_handleHTML_regex_test_double_slash(self):
    # protocol-relative URLs ("//...") should match the double-slash regex
    parser = handleHTML.parse()
    result = parser.double_slash_regex.match("//handbooks.uwa.edu.au/units/unitdetails?code=cits3200")
    self.assertIsNotNone(result)

def test_handleHTML_regex_test_http(self):
    # absolute http:// URLs should match the http regex
    parser = handleHTML.parse()
    result = parser.http_regex.match("http://handbooks.uwa.edu.au/units/unitdetails?code=cits3200")
    self.assertIsNotNone(result)
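# Illustrative patterns consistent with the regex tests above: a unit code like
# "pys101" (letters followed by digits), a heading tag name like "h5", a
# protocol-relative "//" URL, and an absolute "http://" URL. These exact patterns
# are assumptions for illustration only; the real ones live in handleHTML.parse.
import re

unit_regex = re.compile(r"^[a-zA-Z]{3,4}\d{3,4}$")   # matches "pys101", rejects "year 2015"
heading_tag_regex = re.compile(r"^h[1-6]$")          # matches "h1" through "h6"
double_slash_regex = re.compile(r"^//")              # protocol-relative URLs
http_regex = re.compile(r"^https?://")               # absolute http/https URLs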
# from urllib.request import urlopen
from urllib.error import URLError

import handleHTML
from Stack import Stack

# urls already visited
visited = set()
# urls to visit
stack = Stack()
in_stack = set()
# parser object to manipulate HTML
parser = handleHTML.parse()
#parser.re_init(stack, visited, in_stack)

# recursively load URLs via the stack
def recursiveload(url, dom, db):
    database = db
    count = 0
    stack.push(url)
    in_stack.add(url)
    while not stack.isEmpty():
        try:
            newurl = stack.pop()
            in_stack.remove(newurl)
            if dom in newurl:
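# The body of the "if dom in newurl:" branch is truncated above. A minimal sketch of
# the per-URL work that branch would typically do (fetch the page, feed the HTML to
# the parser, mark the URL visited), written as a standalone helper. The name
# fetch_and_feed, the urlopen-based fetch, and the decode handling are assumptions
# for illustration, not the project's confirmed implementation.
from urllib.request import urlopen

def fetch_and_feed(newurl, parser, visited):
    """Fetch newurl, hand its HTML to the parser, and record the URL as visited."""
    page = urlopen(newurl)  # may raise URLError; the caller's try block would handle it
    html = page.read().decode("utf-8", errors="ignore")
    parser.feed(html)       # html.parser.HTMLParser.feed expects a str
    visited.add(newurl)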
def test_handleHTML_get_domain(self):
    # get_domain() should strip the scheme, path, and query, leaving only the host
    parser = handleHTML.parse()
    parser.url = "http://handbooks.uwa.edu.au/units/unitdetails?code=cits3200"
    dom = parser.get_domain()
    self.assertEqual(dom, "handbooks.uwa.edu.au")

def test_handleHTML_handle_end_tag(self):
    # a closing </body> tag should clear the body_tag_open flag
    parser = handleHTML.parse()
    parser.feed("<body></body>")
    self.assertFalse(parser.body_tag_open)

def test_handleHTML_handle_start_tag(self):
    # an opening <body> tag should set the body_tag_open flag
    parser = handleHTML.parse()
    parser.feed("<body>")
    self.assertTrue(parser.body_tag_open)
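# A minimal sketch of the parser interface these tests assume: a parse class built on
# html.parser.HTMLParser that tracks whether a <body> tag is open and reports the
# domain of its current URL. This is an illustrative reconstruction under those
# assumptions, not the project's actual handleHTML implementation; the urlparse-based
# get_domain in particular is a guess at equivalent behaviour.
from html.parser import HTMLParser
from urllib.parse import urlparse

class parse(HTMLParser):
    def __init__(self):
        super().__init__()
        self.url = ""
        self.body_tag_open = False

    def handle_starttag(self, tag, attrs):
        if tag == "body":
            self.body_tag_open = True

    def handle_endtag(self, tag):
        if tag == "body":
            self.body_tag_open = False

    def get_domain(self):
        # e.g. "handbooks.uwa.edu.au" for the URL used in test_handleHTML_get_domain
        return urlparse(self.url).netloc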