Example #1
    def work(self, thread_name, url_dao):
        """
        This method is assigned to threads.
        As long as there are items in the queue this method wil start crawling them.

        :param thread_name: The name of the current thread
        :param url_dao: an instance of UrlDAO
        :return: Nothing
        """
        try:
            while True:
                # Block until a URL is available on the queue.
                url = self.queue.get()
                url_id = HashService.num_md5(url.url_string)
                timestamp = datetime.now().strftime(
                    '%Y-%m-%dT%H:%M:%S.%f')[:-3] + "Z"
                MyLogger.log(
                    LOG.CRAWLER, "url_id=" + str(url_id) + " url=" +
                    str(url.url_string) + " @timestamp=" + timestamp)
                # Crawl the page, record it as done, then feed the URL to
                # the parser so newly found links are queued.
                Spider(url, thread_name, url_dao, self.redis).run()
                self.crawled.add(url)
                self.parser.add_link_to_queue(url)
                self.parser.start()
                self.queue.task_done()
        except BlacklistNotFoundError:
            # Drain the remaining tasks so queue.join() can return, then
            # surface the failure to the thread's caller.
            while self.queue.unfinished_tasks > 0:
                self.queue.task_done()
            raise MyThreadError
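A minimal sketch of how this worker might be attached to daemon threads (the crawler object, thread count, and thread-naming scheme below are illustrative assumptions, not taken from the source project):

    import threading

    def start_workers(crawler, url_dao, n_threads=4):
        # Each daemon thread runs the worker loop above; queue.join()
        # blocks until every queued URL has been marked task_done().
        for i in range(n_threads):
            name = "Thread-{0}".format(i)
            thread = threading.Thread(target=crawler.work,
                                      args=(name, url_dao))
            thread.daemon = True
            thread.start()
        crawler.queue.join()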
Example #2
    def __crawl_page(self, url):
        """
        This method is the main method of the spider class.
        If the layer of the url or the size of crawled is bigger than it's corresponding property the program
        will clear the queue.
        If it's not it wil start crawling the page by opening a request and getting the html.
        It'll then save the html to redis.
        After that it'll gather all links from that page and add those links to the queue.

        :param url: URL object
        :return: nothing
        """
        start_time = time.time()
        # Stop crawling once the URL is deeper than the max depth or the
        # page limit has been reached
        if url.layer > Properties.SPIDER_MAX_DEPTH \
                or len(self.crawled) > Properties.SPIDER_MAX_PAGES:
            self.deque.clear()
        else:
            try:
                # Fetch the page; only non-empty HTML is persisted and
                # mined for further links.
                request = self.__get_request(url)
                html = self.__get_html(request)
                if len(html) > 0:
                    self.__save_html_to_redis(html)
                    self.__add_links_to_queue(Spider.__gather_links(url, html))
                self.crawled.add(url)
                print(
                    self.name,
                    "is now crawling {}\n\t\t\t\t\t\t Queue {} | Crawled {} | "
                    "Layer: {} | Duration: {}".format(
                        url, len(self.deque), len(self.crawled), url.layer,
                        time.time() - start_time))
            except req.HTTPError as e:
                MyLogger.log(
                    LOG.SPIDER, "HTTP Error occurred [{0}]: {1} {2}".format(
                        str(e.code), e.filename, e.reason))
            except req.URLError as e:
                MyLogger.log(LOG.SPIDER,
                             "URL Error occurred: {0}".format(e.reason))
            except ssl.SSLError as e:
                MyLogger.log(LOG.SPIDER, "SSL Error occurred: {0}".format(e))
            except socket.timeout as e:
                MyLogger.log(LOG.SPIDER, "Timeout occurred: {0}".format(e))
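The docstring implies a driver loop that drains the deque; a hedged sketch of what that loop might look like inside the Spider class (the run method and the membership check are assumptions for illustration):

    def run(self):
        # Keep crawling while the deque has work; __crawl_page clears the
        # deque once SPIDER_MAX_DEPTH or SPIDER_MAX_PAGES is exceeded,
        # which ends this loop.
        while self.deque:
            url = self.deque.popleft()
            if url not in self.crawled:
                self.__crawl_page(url)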
Example #3
    def __get_html(self, request):
        """
        This method returns the HTML of the request.

        :param request: Request object
        :return: The HTML of the request, or an empty string on failure
        """
        html_string = ''
        try:
            response = req.urlopen(request, timeout=self.TIMEOUT_TIME)
            # Guard against responses without a Content-Type header and
            # only decode responses that actually carry HTML.
            content_type = response.getheader('Content-Type') or ''
            if 'text/html' in content_type:
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8").strip()
        except UnicodeDecodeError as e:
            MyLogger.log(LOG.SPIDER,
                         "UnicodeDecodeError occurred: {0}".format(e))
        except socket.timeout as e:
            MyLogger.log(LOG.SPIDER, "Timeout occurred: {0}".format(e))
        except ConnectionResetError as e:
            MyLogger.log(
                LOG.SPIDER, "ConnectionResetError occurred [{0}]: {1}".format(
                    str(e.errno), e.strerror))
        except ssl.CertificateError as e:
            MyLogger.log(LOG.SPIDER,
                         "SSL CertificateError: {0}".format(e.args))
        except BadStatusLine as e:
            MyLogger.log(LOG.SPIDER, "BadStatusLine: {0}".format(e.args))
        except IncompleteRead as e:
            MyLogger.log(LOG.SPIDER, "IncompleteRead: {0}".format(e.args))
        return html_string
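Note that __get_html always decodes as UTF-8. A standalone variant of the same fetch-and-decode pattern that honors the charset advertised by the server could look like this (the function name, timeout, and fallback policy are assumptions, not the project's code):

    import urllib.request as req

    def fetch_html(url, timeout=10):
        # Fetch a page and decode it with the charset from the
        # Content-Type header, falling back to UTF-8 with replacement
        # characters instead of raising UnicodeDecodeError.
        response = req.urlopen(url, timeout=timeout)
        content_type = response.getheader('Content-Type') or ''
        if 'text/html' not in content_type:
            return ''
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset, errors='replace').strip()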