def get_website_object(url):
    """Parse *url* with the custom RoblyParser and build a Website object.

    The Website object bundles every HTML element that is stored in the
    database (title, headings, links, images, body, meta description,
    keywords, robots-index flag, and Google pagerank).

    Params:
        url: The url of the website to be parsed.

    Return:
        Website object containing all of the site's robly_data; an empty
        Website() is returned when parsing fails entirely.
    """
    try:
        print("[Robly] Parsing {}".format(url))
        # Parse website
        parser = RoblyParser()
        html_object = parser.get_webpage_as_object(url)
        # Pagerank lookup is best-effort: any failure leaves it at 0.
        # (Was a bare `except:` — narrowed to Exception so KeyboardInterrupt
        # and SystemExit are no longer swallowed.)
        pagerank = 0
        try:
            pagerank = get_page_rank(url)
        except Exception:
            pass
        # De-duplicate list-valued fields; set() loses order, matching the
        # original behavior.
        website = Website(html_object.url,
                          html_object.title,
                          list(set(html_object.h1s)),
                          list(set(html_object.links)),
                          list(set(html_object.images)),
                          html_object.body,
                          html_object.description,
                          list(set(html_object.keywords)),
                          html_object.robots_index,
                          pagerank)
        return website
    except Exception as e:
        print(str(e))
        # Lazy %-formatting instead of string concatenation.
        logging.error("[ROBLY] crawler.py - error parsing website - %s", e)
        return Website()
def test_getpr(self):
    """Pagerank lookup for a known-live URL should yield a non-None value."""
    url = "http://stackoverflow.com/questions/15014310/python3-xrange-lack-hurts"
    rank = get_page_rank(url)
    self.assertIsNotNone(rank)
def test_getpr(self):
    """get_page_rank must return something (not None) for a real URL.

    NOTE(review): this test appears to be a duplicate of an identical
    test_getpr defined earlier in this chunk — confirm and remove one.
    """
    self.assertIsNotNone(
        get_page_rank(
            "http://stackoverflow.com/questions/15014310/python3-xrange-lack-hurts"
        )
    )