Example #1
def gatherLinks(pageURL):
    # Requires: from urllib.request import urlopen
    html_string = ''
    try:
        response = urlopen(pageURL)
        # Only decode the body when the server reports an HTML document
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        # Finder parses the page and collects the links it contains
        finderObject = Finder(Spider.baseURL, pageURL)
        finderObject.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finderObject.pageLink()
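Finder and Spider are project classes that the example does not show. As a rough idea of what the snippet assumes, a minimal Finder could be an html.parser.HTMLParser subclass that resolves each href against the base URL and exposes the collected set through pageLink(); the sketch below is hypothetical, not the original project's implementation.

from html.parser import HTMLParser
from urllib.parse import urljoin

class Finder(HTMLParser):
    # Hypothetical sketch of the link-collecting parser assumed above
    def __init__(self, base_url, page_url):
        super().__init__()
        self.baseURL = base_url
        self.pageURL = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Gather every href on the page, resolved to an absolute URL
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href' and value:
                    self.links.add(urljoin(self.baseURL, value))

    def pageLink(self):
        return self.links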
Example #2
def gather_links(page_url):
    # Requires: from urllib.request import urlopen
    html_string = ''
    try:
        response = urlopen(page_url)
        # Decode the body only for HTML responses
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = Finder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
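In a crawler built around these helpers, gather_links is typically called once per queued page and the returned set is merged back into the queue. The loop below is a hypothetical usage sketch; the queue and crawled names and the starting URL are assumptions, not part of the example.

# Hypothetical crawl loop driving gather_links (names are assumptions)
queue = {Spider.base_url}
crawled = set()

while queue:
    page_url = queue.pop()
    # Queue only links that have not been visited yet
    for link in gather_links(page_url):
        if link not in crawled:
            queue.add(link)
    crawled.add(page_url)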
Example #3
def gather_link(page_url):
    # Requires: from urllib.request import Request, urlopen; import ssl
    html_string = ''
    try:
        # Send a browser-like User-Agent and skip certificate verification
        req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
        context = ssl._create_unverified_context()
        response = urlopen(req, context=context)
        content_type = str(response.getheader('Content-Type'))
        # Use the charset advertised by the server, falling back to UTF-8
        if 'charset=' in content_type:
            charset = content_type.split('charset=')[1]
        else:
            charset = 'utf-8'
        if 'text/html' in content_type:
            html_bytes = response.read()
            html_string = html_bytes.decode(charset)
        parser = TheHTMLParse(html_string, Spider.project_name, page_url)
        finder = Finder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        logthis("Spider. Sorry sir i can't crawl this page ...",
                Spider.project_name)
        return set()
    return finder.page_links()
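The call to ssl._create_unverified_context() in Example #3 uses a private helper to switch off certificate verification. If the intent is simply to crawl hosts with self-signed or broken certificates, the public ssl API gives the same effect; the snippet below is an illustrative alternative, not part of the original example.

import ssl
from urllib.request import Request, urlopen

# Public-API equivalent of ssl._create_unverified_context():
# a default context with hostname and certificate checks disabled
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

req = Request('https://example.com/', headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req, context=context) as response:
    html_bytes = response.read()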
Example #4
    def crawl(url):
        # Fetch and parse a page only if it has not been crawled yet
        if url not in spider.links_crawled:
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req) as urldata:
                data = urldata.read()
            f = Finder()
            f.baseurl = spider.website_url
            # Decode the response and unescape HTML entities before parsing
            data = data.decode('utf-8')
            data = html.unescape(data)
            f.feed(data)
            f.close()
            links = f.return_links()
            # Move this URL from the pending list to the crawled list,
            # then queue the newly found links and store the page data
            spider.links_website.remove(url)
            for val in links:
                spider.links_website.append(val)

            spider.links_crawled.append(url)
            spider.data_dict[url] = f.return_data()
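crawl() assumes a spider object (or class) that keeps links_website as the pending list, links_crawled as the visited list, and data_dict for the extracted page data. A driver over those structures might look like the hypothetical sketch below; only the attribute names come from the example, the loop itself is an assumption.

# Hypothetical driver: keep calling crawl() while pages remain pending
spider.links_website = [spider.website_url]
spider.links_crawled = []
spider.data_dict = {}

while spider.links_website:
    url = spider.links_website[0]
    if url in spider.links_crawled:
        # Duplicate of an already-crawled page: just drop it from the queue
        spider.links_website.remove(url)
    else:
        # crawl() removes url from links_website and records its links/data
        crawl(url)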