Ejemplo n.º 1
0
def get_website_object(url):
    """
    Parse a url with the custom built parser (RoblyParser) and wrap the
    result in a Website object for easy access to all html elements that
    are to be stored in the database.

    The h1s, links, images and keywords are de-duplicated through a set, so
    their original order is not preserved. The pagerank lookup is
    best-effort: if it fails the failure is logged and a pagerank of 0 is
    used.

    Params : url        The url of the website to be parsed
    Return : website    Website object built from the parsed page, or an
                        empty Website() if parsing or construction fails
    """
    try:
        print("[Robly] Parsing {}".format(url))
        # Parse website
        parser = RoblyParser()
        html_object = parser.get_webpage_as_object(url)
        pagerank = 0
        try:
            # Look up the pagerank for this url
            pagerank = get_page_rank(url)
        except Exception as e:
            # A failed lookup must not abort the crawl, but it should be
            # visible in the logs. Catching Exception (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            logging.warning(
                "[ROBLY] crawler.py - error getting pagerank - " + str(e))
        # Create website object
        website = Website(html_object.url, html_object.title,
                          list(set(html_object.h1s)),
                          list(set(html_object.links)),
                          list(set(html_object.images)), html_object.body,
                          html_object.description,
                          list(set(html_object.keywords)),
                          html_object.robots_index, pagerank)
        return website
    except Exception as e:
        # Any parsing/construction error degrades to an empty Website so the
        # caller always receives a Website instance.
        print(str(e))
        logging.error("[ROBLY] crawler.py - error parsing website - " + str(e))
        return Website()
Ejemplo n.º 2
0
def generate_website():
    """
    Build a sample Website object filled with fixed example data.

    Return : website    Website object whose attributes are set to the
                        hard-coded sample values below
    """
    article_text = """
    A new nanotechnology that doubles the life of smartphone, laptop and electric-vehicle batteries even after being charged and discharged more than 1,000 times has been developed by researchers at the University of Limerick.
    The breakthrough means the research team could be tapping into a market estimated to be worth US$53.7bn by 2020.
    “We have developed a new germanium nanowire-based anode that has the ability to greatly increase the capacity and lifetimes of lithium-ion batteries,” said lead researcher Dr Kevin Ryan.
    The research published by the journal Nano Letters outlines the findings.
    “This breakthrough is important for mobile computing and telecoms but also for the emerging electric-vehicle market, allowing for smaller and lighter batteries that can hold more charge for longer and maintain this performance over the lifetime of the product.”
    Small is the next big thing
    The research team has also ensured its nanotechnology solution is scalable, low-cost and low-energy, making the technology both greener and commercially viable.
    The research has been supported by Science Foundation Ireland (SFI) under the Principal Investigator Program to Dr Kevin Ryan and also by EU funding through the GREENLION Project.
    “The typical lithium-ion battery on the market today is based on graphite and has a relatively low capacity. This limits the amount of energy which can be stored. In our research we used an alternative element, germanium, which is of a higher capacity,” Ryan said.
    “The challenge has been that the material expands quite dramatically during charging and falls apart after a relatively small number of cycles.
    “By using nanotechnology, we have found a way to restructure germanium, in the form of nanowires, into a stable porous material that is an ideal battery material as it remains stable over very long time scales during continued operation,” Ryan added.
    """
    # (attribute name, value) pairs, applied in this order.
    sample_values = (
        ("url", "http://thisisanexampleroblyurl.com"),
        ("description", "Sample description of robly website"),
        ("h1s", ["Awesome", "Amazing"]),
        ("images", [
            "http://static3.wikia.nocookie.net/__cb20130606164014/animalcrossing/images/3/30/Monkey.jpg",
            "http://www.thelostogle.com/wp-content/uploads/2013/12/happy-monkey-550x366.jpg",
        ]),
        ("keywords", ["one keyword", "two keyword", "three keyword"]),
        ("links", ["http://google.com", "http://facebook.com", "http://play.google.com"]),
        ("robots_index", True),
        ("title", "Super Awesome Battery Stuff"),
        ("non_html", article_text),
    )
    website = Website()
    for attribute, value in sample_values:
        setattr(website, attribute, value)
    return website
Ejemplo n.º 3
0
 def get_webpage_as_object(self, url):
     """
     Fetch url and convert its html into an HTMLObject.

     Params : url    The url of the page to fetch and tokenise
     Return : the populated HTMLObject when self.get_html yields html with
              status 200; otherwise (no html, a different status, or an
              error while fetching/tokenising) an empty Website()
     """
     try:
         html, status = self.get_html(url)
         if not html or status != 200:
             # Nothing usable was fetched.
             return Website()
         tokens = Tokens().tokenise(html)
         objectifier = HTMLObject()
         objectifier.tokens_to_html_object(tokens, url)
         return objectifier
     except Exception:
         # Best-effort: any fetch/parse error degrades to an empty Website.
         # Catching Exception (not a bare except) lets KeyboardInterrupt /
         # SystemExit propagate.
         return Website()
Ejemplo n.º 4
0
def get_websites_from_duckduckgo(search_query):
    """
    Called when no search results are found locally.
    Returns related results from DuckDuckGo's API.

    Each entry of the response's `related` list is turned into a Website
    whose title and description are both the entry's text; h1s, links,
    images, non_html and keywords are left empty. Entries that cannot be
    converted are skipped.

    Params : search_query    The query passed to query()
    Return : websites        List of Website objects (possibly empty)
    """
    response = query(search_query, useragent='RoblySearch')
    websites = []
    for r in response.related:
        try:
            w = Website(r.url,
                        r.text,
                        h1s=[],
                        links=[],
                        images=[],
                        non_html=[],
                        description=r.text,
                        keywords=[],
                        robots_index=True)
        except Exception:
            # Skip entries that cannot be converted (e.g. missing url/text).
            # Catching Exception (not a bare except) lets KeyboardInterrupt /
            # SystemExit propagate.
            continue
        websites.append(w)
    return websites
Ejemplo n.º 5
0
 def convert_dict_to_website_object(self, website_dict):
     """
     Dictionary to Website object.

     Copies the 'url', 'description', 'h1s', 'images', 'keywords', 'links',
     'robots_index', 'title' and 'non_html' entries of website_dict onto a
     new Website. These keys are required (for a plain dict a missing one
     raises KeyError). 'pagerank' is optional: when it is absent the
     Website's pagerank is left untouched.
     """
     website = Website()
     website.url = website_dict['url']
     website.description = website_dict['description']
     website.h1s = website_dict['h1s']
     website.images = website_dict['images']
     website.keywords = website_dict['keywords']
     website.links = website_dict['links']
     website.robots_index = website_dict['robots_index']
     website.title = website_dict['title']
     website.non_html = website_dict['non_html']
     try:
         website.pagerank = website_dict['pagerank']
     except KeyError:
         # No 'pagerank' entry in this dictionary; keep the default.
         pass
     return website
Ejemplo n.º 6
0
 def convert_dict_to_website_object(self, website_dict):
     """
     Dictionary to Website object.

     Copies the 'url', 'description', 'h1s', 'images', 'keywords', 'links',
     'robots_index', 'title' and 'non_html' entries of website_dict onto a
     new Website. These keys are required (for a plain dict a missing one
     raises KeyError). 'pagerank' is optional: when it is absent the
     Website's pagerank is left untouched.
     """
     website = Website()
     website.url = website_dict['url']
     website.description = website_dict['description']
     website.h1s = website_dict['h1s']
     website.images = website_dict['images']
     website.keywords = website_dict['keywords']
     website.links = website_dict['links']
     website.robots_index = website_dict['robots_index']
     website.title = website_dict['title']
     website.non_html = website_dict['non_html']
     try:
         website.pagerank = website_dict['pagerank']
     except KeyError:
         # No 'pagerank' entry in this dictionary; keep the default.
         pass
     return website
Ejemplo n.º 7
0
def generate_website():
    """
    Create a Website object populated with fixed sample data.

    Return : website    Website object carrying the hard-coded example
                        url, description, h1s, images, keywords, links,
                        robots_index, title and non_html text
    """
    # Sample values, gathered up front so the assignments below read as a
    # simple field-by-field fill.
    image_urls = [
        "http://static3.wikia.nocookie.net/__cb20130606164014/animalcrossing/images/3/30/Monkey.jpg",
        "http://www.thelostogle.com/wp-content/uploads/2013/12/happy-monkey-550x366.jpg",
    ]
    outgoing_links = ["http://google.com", "http://facebook.com",
                      "http://play.google.com"]
    keyword_list = ["one keyword", "two keyword", "three keyword"]
    article_text = """
    A new nanotechnology that doubles the life of smartphone, laptop and electric-vehicle batteries even after being charged and discharged more than 1,000 times has been developed by researchers at the University of Limerick.
    The breakthrough means the research team could be tapping into a market estimated to be worth US$53.7bn by 2020.
    “We have developed a new germanium nanowire-based anode that has the ability to greatly increase the capacity and lifetimes of lithium-ion batteries,” said lead researcher Dr Kevin Ryan.
    The research published by the journal Nano Letters outlines the findings.
    “This breakthrough is important for mobile computing and telecoms but also for the emerging electric-vehicle market, allowing for smaller and lighter batteries that can hold more charge for longer and maintain this performance over the lifetime of the product.”
    Small is the next big thing
    The research team has also ensured its nanotechnology solution is scalable, low-cost and low-energy, making the technology both greener and commercially viable.
    The research has been supported by Science Foundation Ireland (SFI) under the Principal Investigator Program to Dr Kevin Ryan and also by EU funding through the GREENLION Project.
    “The typical lithium-ion battery on the market today is based on graphite and has a relatively low capacity. This limits the amount of energy which can be stored. In our research we used an alternative element, germanium, which is of a higher capacity,” Ryan said.
    “The challenge has been that the material expands quite dramatically during charging and falls apart after a relatively small number of cycles.
    “By using nanotechnology, we have found a way to restructure germanium, in the form of nanowires, into a stable porous material that is an ideal battery material as it remains stable over very long time scales during continued operation,” Ryan added.
    """

    website = Website()
    website.url = "http://thisisanexampleroblyurl.com"
    website.description = "Sample description of robly website"
    website.h1s = ["Awesome", "Amazing"]
    website.images = image_urls
    website.keywords = keyword_list
    website.links = outgoing_links
    website.robots_index = True
    website.title = "Super Awesome Battery Stuff"
    website.non_html = article_text
    return website