def get_website_object(url):
    """
    Parse ``url`` with the custom parser (RoblyParser) and wrap the result
    in a Website object holding the html elements to be stored.

    The h1s, links, images and keywords reported by the parser are
    de-duplicated through ``set`` before being handed to ``Website``, so
    their original order is not preserved.

    The page rank lookup is best effort: when ``get_page_rank`` raises, the
    failure is logged and the rank stays at 0.

    Params : url        The url of the website to be parsed
    Return : website    Website object built from the parsed page, or an
                        empty ``Website()`` when parsing/construction fails
    """
    try:
        print("[Robly] Parsing {}".format(url))
        # Parse website
        parser = RoblyParser()
        html_object = parser.get_webpage_as_object(url)

        # Default rank used whenever the lookup below fails.
        pagerank = 0
        try:
            pagerank = get_page_rank(url)
        except Exception as rank_error:
            # Only swallow ordinary errors (a bare ``except`` would also eat
            # KeyboardInterrupt/SystemExit) and leave a trace in the log
            # instead of failing silently.
            logging.warning(
                "[ROBLY] crawler.py - could not get pagerank for %s - %s",
                url, rank_error)

        # Create website object
        website = Website(html_object.url, html_object.title,
                          list(set(html_object.h1s)),
                          list(set(html_object.links)),
                          list(set(html_object.images)), html_object.body,
                          html_object.description,
                          list(set(html_object.keywords)),
                          html_object.robots_index, pagerank)
        return website
    except Exception as e:
        # Any parsing/construction failure degrades to an empty Website so
        # the caller always receives an object.
        print(str(e))
        logging.error("[ROBLY] crawler.py - error parsing website - " + str(e))
        return Website()
Beispiel #2
0
def get_website_object(url):
    """
    Build a Website object for ``url`` using the custom parser (RoblyParser).

    The parsed page's h1s, links, images and keywords are passed through
    ``set`` to drop duplicates (ordering is therefore not preserved); url,
    title, body, description and robots_index are forwarded unchanged.

    Fetching the page rank is best effort: if ``get_page_rank`` raises, a
    warning is logged and a rank of 0 is used.

    Params : url        The url of the website to be parsed
    Return : website    Website object containing the parsed data, or an
                        empty ``Website()`` if anything else goes wrong
    """
    try:
        print("[Robly] Parsing {}".format(url))
        # Parse website
        html_object = RoblyParser().get_webpage_as_object(url)

        try:
            pagerank = get_page_rank(url)
        except Exception as rank_error:
            # Catch ``Exception`` rather than everything, so that
            # KeyboardInterrupt/SystemExit still propagate, and record why
            # the fallback rank was used.
            logging.warning(
                "[ROBLY] crawler.py - could not get pagerank for %s - %s",
                url, rank_error)
            pagerank = 0

        # Create website object
        return Website(html_object.url, html_object.title,
                       list(set(html_object.h1s)),
                       list(set(html_object.links)),
                       list(set(html_object.images)),
                       html_object.body, html_object.description,
                       list(set(html_object.keywords)),
                       html_object.robots_index, pagerank)
    except Exception as e:
        # Fall back to an empty Website so callers always get an object.
        print(str(e))
        logging.error("[ROBLY] crawler.py - error parsing website - " + str(e))
        return Website()
Beispiel #3
0
 def test_getpr(self):
     """get_page_rank should return something other than None for a URL."""
     target_url = "http://stackoverflow.com/questions/15014310/python3-xrange-lack-hurts"
     self.assertIsNotNone(get_page_rank(target_url))
 def test_getpr(self):
     # Smoke test: the lookup for a Stack Overflow question URL must
     # produce a value (anything but None).
     question_url = "http://stackoverflow.com/questions/15014310/python3-xrange-lack-hurts"
     rank = get_page_rank(question_url)
     self.assertIsNotNone(rank)