def get_website_object(url):
    """
    Parse the url with the custom built parser (roblyparser) and wrap the
    result in a Website object, giving easy access to all html elements that
    are to be stored in the database.

    Params : url        The url of the website to be parsed
    Return : website    Website object built from the parsed page. If parsing
                        or construction raises, the error is printed and
                        logged and an empty Website() is returned instead.
    """
    try:
        print("[Robly] Parsing {}".format(url))

        # Parse website
        parser = RoblyParser()
        html_object = parser.get_webpage_as_object(url)

        # Pagerank lookup is best-effort: any failure leaves the default of 0.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate instead of being silently swallowed.
        pagerank = 0
        try:
            # Get website pagerank from google
            pagerank = get_page_rank(url)
        except Exception as e:
            logging.debug(
                "[ROBLY] crawler.py - could not get pagerank for %s - %s",
                url, e)

        # Create website object. The list-valued fields are de-duplicated via
        # set(), so their original order is not preserved.
        website = Website(html_object.url,
                          html_object.title,
                          list(set(html_object.h1s)),
                          list(set(html_object.links)),
                          list(set(html_object.images)),
                          html_object.body,
                          html_object.description,
                          list(set(html_object.keywords)),
                          html_object.robots_index,
                          pagerank)
        return website
    except Exception as e:
        print(str(e))
        logging.error("[ROBLY] crawler.py - error parsing website - " + str(e))
        return Website()
def generate_website():
    """
    Build a sample Website object populated with fixed example data
    (url, description, headings, images, keywords, links, title and a block
    of article text stored in non_html).

    Return : website    The populated Website object
    """
    sample_headings = ["Awesome", "Amazing"]
    sample_images = [
        "http://static3.wikia.nocookie.net/__cb20130606164014/animalcrossing/images/3/30/Monkey.jpg",
        "http://www.thelostogle.com/wp-content/uploads/2013/12/happy-monkey-550x366.jpg",
    ]
    sample_keywords = ["one keyword", "two keyword", "three keyword"]
    sample_links = [
        "http://google.com",
        "http://facebook.com",
        "http://play.google.com",
    ]
    # The article text is kept verbatim, including its embedded line break.
    sample_article = """ A new nanotechnology that doubles the life of smartphone, laptop and electric-vehicle batteries even after being charged and discharged more than 1,000 times has been developed by researchers at the University of Limerick. The breakthrough means the research team could be tapping into a market estimated to be worth US$53.7bn by 2020. “We have developed a new germanium nanowire-based anode that has the ability to greatly increase the capacity and lifetimes of lithium-ion batteries,” said lead researcher Dr Kevin Ryan. The research published by the journal Nano Letters outlines the findings. “This breakthrough is important for mobile computing and telecoms but also for the emerging electric-vehicle market, allowing for smaller and lighter batteries that can hold more charge for longer and maintain this performance over the lifetime of the product.” Small is the next big thing The research team has also ensured its nanotechnology solution is scalable, low-cost and low-energy, making the technology both greener and commercially viable. The research has been supported by Science Foundation Ireland (SFI) under the Principal Investigator Program to Dr Kevin Ryan and also by EU funding through the GREENLION Project. “The typical lithium-ion battery on the market today is based on graphite and has a relatively low capacity. 
This limits the amount of energy which can be stored. In our research we used an alternative element, germanium, which is of a higher capacity,” Ryan said. “The challenge has been that the material expands quite dramatically during charging and falls apart after a relatively small number of cycles. “By using nanotechnology, we have found a way to restructure germanium, in the form of nanowires, into a stable porous material that is an ideal battery material as it remains stable over very long time scales during continued operation,” Ryan added. """

    sample = Website()
    sample.url = "http://thisisanexampleroblyurl.com"
    sample.description = "Sample description of robly website"
    sample.h1s = sample_headings
    sample.images = sample_images
    sample.keywords = sample_keywords
    sample.links = sample_links
    sample.robots_index = True
    sample.title = "Super Awesome Battery Stuff"
    sample.non_html = sample_article
    return sample
def get_webpage_as_object(self, url):
    """
    Fetch the page at url, tokenise its html and convert the tokens into an
    HTMLObject.

    Params : url    The url of the webpage to fetch and parse
    Return : The populated HTMLObject when html was returned with status
             200; otherwise (no html, non-200 status, or any error while
             fetching/parsing) an empty Website().
    """
    # NOTE(review): the failure path returns a Website while the success path
    # returns an HTMLObject - confirm callers cope with both types.
    try:
        html, status = self.get_html(url)
        if html and status == 200:
            tokeniser = Tokens()
            tokens = tokeniser.tokenise(html)
            objectifier = HTMLObject()
            objectifier.tokens_to_html_object(tokens, url)
            return objectifier
        return Website()
    except Exception:
        # Best-effort: parsing problems yield an empty result. Only Exception
        # is caught so KeyboardInterrupt / SystemExit are not swallowed.
        return Website()
def get_websites_from_duckduckgo(search_query):
    """
    Called when no search results are found locally.
    Returns related results from DuckDuckGo's API

    Params : search_query   The query passed on to DuckDuckGo
    Return : websites       List of Website objects, one per related result
                            that could be converted; results that fail to
                            convert are skipped.
    """
    websites = []
    response = query(search_query, useragent='RoblySearch')
    for r in response.related:
        try:
            # Only url and text come from the result; text is used for both
            # the title and the description, the other fields are left empty.
            w = Website(r.url, r.text, h1s=[], links=[], images=[],
                        non_html=[], description=r.text, keywords=[],
                        robots_index=True)
        except Exception:
            # Best-effort: skip results that cannot be converted (presumably
            # entries lacking url/text - TODO confirm). Only Exception is
            # caught so KeyboardInterrupt / SystemExit still propagate.
            continue
        websites.append(w)
    return websites
def convert_dict_to_website_object(self, website_dict):
    """
    Dictionary to Website object.

    Params : website_dict   Dictionary with the keys 'url', 'description',
                            'h1s', 'images', 'keywords', 'links',
                            'robots_index', 'title' and 'non_html';
                            'pagerank' is optional.
    Return : website        Website object populated from the dictionary
    Raises : KeyError       If any of the required keys is missing
    """
    website = Website()
    website.url = website_dict['url']
    website.description = website_dict['description']
    website.h1s = website_dict['h1s']
    website.images = website_dict['images']
    website.keywords = website_dict['keywords']
    website.links = website_dict['links']
    website.robots_index = website_dict['robots_index']
    website.title = website_dict['title']
    website.non_html = website_dict['non_html']
    try:
        website.pagerank = website_dict['pagerank']
    except KeyError:
        # 'pagerank' is optional: when absent, the Website keeps whatever
        # pagerank value its constructor gave it.
        pass
    return website
def generate_website():
    """
    Create a Website object filled with fixed sample values: url,
    description, h1s, images, keywords, links, robots_index, title and an
    article text stored in non_html.

    Return : website    The populated Website object
    """
    website = Website()

    # (attribute name, value) pairs, applied in this order.
    sample_fields = (
        ("url", "http://thisisanexampleroblyurl.com"),
        ("description", "Sample description of robly website"),
        ("h1s", ["Awesome", "Amazing"]),
        ("images", [
            "http://static3.wikia.nocookie.net/__cb20130606164014/animalcrossing/images/3/30/Monkey.jpg",
            "http://www.thelostogle.com/wp-content/uploads/2013/12/happy-monkey-550x366.jpg",
        ]),
        ("keywords", ["one keyword", "two keyword", "three keyword"]),
        ("links", [
            "http://google.com",
            "http://facebook.com",
            "http://play.google.com",
        ]),
        ("robots_index", True),
        ("title", "Super Awesome Battery Stuff"),
        # The article text is kept verbatim, including its embedded line break.
        ("non_html", """ A new nanotechnology that doubles the life of smartphone, laptop and electric-vehicle batteries even after being charged and discharged more than 1,000 times has been developed by researchers at the University of Limerick. The breakthrough means the research team could be tapping into a market estimated to be worth US$53.7bn by 2020. “We have developed a new germanium nanowire-based anode that has the ability to greatly increase the capacity and lifetimes of lithium-ion batteries,” said lead researcher Dr Kevin Ryan. The research published by the journal Nano Letters outlines the findings. “This breakthrough is important for mobile computing and telecoms but also for the emerging electric-vehicle market, allowing for smaller and lighter batteries that can hold more charge for longer and maintain this performance over the lifetime of the product.” Small is the next big thing The research team has also ensured its nanotechnology solution is scalable, low-cost and low-energy, making the technology both greener and commercially viable. The research has been supported by Science Foundation Ireland (SFI) under the Principal Investigator Program to Dr Kevin Ryan and also by EU funding through the GREENLION Project. “The typical lithium-ion battery on the market today is based on graphite and has a relatively low capacity. 
This limits the amount of energy which can be stored. In our research we used an alternative element, germanium, which is of a higher capacity,” Ryan said. “The challenge has been that the material expands quite dramatically during charging and falls apart after a relatively small number of cycles. “By using nanotechnology, we have found a way to restructure germanium, in the form of nanowires, into a stable porous material that is an ideal battery material as it remains stable over very long time scales during continued operation,” Ryan added. """),
    )
    for field_name, field_value in sample_fields:
        setattr(website, field_name, field_value)

    return website