コード例 #1
0
class MyApp(object):
    """
    Demo application built around a Master: numbered tasks are handed to
    whichever slaves are idle, and the loop keeps going until no task is
    left and the Master reports that it is done.
    """
    def __init__(self, slaves):
        # the Master is told up front which slaves it may drive
        self.master = Master(slaves)

    def terminate_slaves(self):
        """
        Forward the shutdown request to the Master so that the slaves
        leave their run loop.
        """
        self.master.terminate_slaves()

    def run(self, tasks=10):
        """
        Main loop of the application.

        ``tasks`` is the number of pretend work items; they are the
        integers ``0 .. tasks-1`` and are handed out in that order.
        Each pass assigns work to ready slaves, collects the results of
        completed slaves, prints a line per event and then sleeps 0.3 s.
        """
        master = self.master

        # stand-in for a real work queue: one integer per task
        pending = list(range(tasks))

        assigned_msg = 'Master: slave %d is going to do task %d'
        success_msg = 'Master: slave %d finished is task and says "%s"'
        failure_msg = 'Master: slave %d failed to accomplish his task'

        while True:
            # stop once nothing is queued and the Master says it is done
            if not pending and master.done():
                break

            # hand the oldest queued task to every idle slave
            for worker in master.get_ready_slaves():
                if pending:
                    job = pending.pop(0)
                    print(assigned_msg % (worker, job))
                    master.run(worker, data=('Do task', job))
                else:
                    break

            # collect results so these slaves can be given work again
            for worker in master.get_completed_slaves():
                ok, reply = master.get_data(worker)
                if not ok:
                    print(failure_msg % worker)
                    continue
                print(success_msg % (worker, reply))

            # pause before polling the Master again
            time.sleep(0.3)
コード例 #2
0
class MyApp(object):
    """
    Crawler front-end: URLs are taken from ``self.url_queue`` and handed to
    the slaves managed by a Master; links found in the fetched pages are
    appended to the queue. The crawl stops when the queue is drained and the
    Master is done, or when ``self.limit`` dispatches have been made.
    """
    def __init__(self, slaves):

        # when creating the Master we tell it what slaves it can handle
        self.master = Master(slaves)

        # seed of the crawl; other seeds that were used:
        # 'http://fanacmilan.com/' 'http://www.greenworldmoldova.com/'
        self.url_queue = [
            'http://riweb.tibeica.com/crawl/'
        ]
        # keys are "scheme://netloc/path" strings of dispatched URLs
        self.visited = {}

        self.UTILS = Utils()

        # number of URLs that may still be dispatched to slaves
        self.limit = 200

    def terminate_slaves(self):
        """
        Call this to make all slaves exit their run loop
        """
        self.master.terminate_slaves()

    def run(self):
        """
        Core loop: keep dispatching queued URLs to idle slaves and
        collecting their results.

        A URL is dispatched only if it is not in ``self.visited`` and the
        site's robots.txt allows ``self.UTILS.USER_AGENT`` to fetch it.
        Each dispatch decrements ``self.limit``. Once the limit is used up
        no further URL is dispatched; after the next batch of completed
        slaves has been processed the slaves are terminated and the method
        returns.
        """

        #
        # while we have work to do and not all slaves completed
        #
        while self.url_queue or not self.master.done():

            #
            # give work to do to each idle slave
            #
            for slave in self.master.get_ready_slaves():

                # nothing left to hand out, or the dispatch budget is spent
                if not self.url_queue or self.limit <= 0:
                    break
                current_url = self.url_queue.pop(
                    0)  # get next url in the queue
                # NOTE(review): visited keys are stored without query or
                # fragment (see below), so this raw-URL test only matches
                # URLs that have neither -- confirm the intended key.
                if current_url in self.visited:
                    continue

                print('Slave {0} is going to process url {1}'.format(
                    slave, current_url))

                # check url in robots
                rp = urllib.robotparser.RobotFileParser()
                try:
                    url = urlparse(current_url)
                    rp.set_url(url.scheme + '://' + url.netloc + '/robots.txt')
                    self.read_robots(rp)
                    if not rp.can_fetch(self.UTILS.USER_AGENT, current_url):
                        continue
                except Exception:
                    # best effort: an unparsable URL, or a robots.txt that
                    # cannot be read in time, just drops this URL;
                    # KeyboardInterrupt/SystemExit still propagate
                    continue

                # set to visited current url
                self.visited[url.scheme + '://' + url.netloc + url.path] = True
                self.master.run(slave, data=current_url)
                self.limit -= 1
                print('Limit: {}'.format(self.limit))

            #
            # reclaim slaves that have finished working
            # so that we can assign them more work
            #
            budget_spent = False
            for slave in self.master.get_completed_slaves():
                done, code, file_path, url = self.master.get_data(slave)
                if done:
                    if code == 200:
                        self.get_links(file_path, url)
                    else:
                        # for non-200 answers the slave's third field is
                        # treated as the location to visit instead
                        new_location = file_path
                        if code == 301:
                            try:
                                self.visited.pop(url.geturl(), None)
                            except (AttributeError, TypeError):
                                # url has no geturl() or is unusable as a
                                # key; nothing to forget in that case
                                pass
                        if (new_location
                                and new_location not in self.url_queue
                                and new_location not in self.visited):
                            self.url_queue.insert(0, new_location)
                else:
                    print(
                        'Failed to process the url: {0} --- Response code: {1}'
                        .format(url.geturl(), code))

                if self.limit <= 0:
                    budget_spent = True

            if budget_spent:
                # terminate once, after the whole batch was handled, and
                # leave the loop: nothing can be dispatched after this
                self.terminate_slaves()
                return

    def get_links(self, html_file, url):
        """
        Queue the links found in a downloaded page.

        html_file -- path of the saved HTML document
        url       -- URL the document was fetched from; must provide
                     ``geturl()`` (used as base for relative links)

        Nothing is queued when the page carries a robots meta tag whose
        content includes ``nofollow``. Fragment-only links, links with a
        scheme other than http/https and malformed links are skipped; every
        other link is made absolute and appended to ``self.url_queue``
        unless it is already queued or in ``self.visited``.
        """
        with open(html_file, 'r') as file:
            soup = BeautifulSoup(file, features="html.parser")

        meta_robots = soup.find("meta", attrs={"name": "robots"})
        # if robots meta exists, check if the following is allowed
        if meta_robots:
            # the tag may lack a content attribute; directives are matched
            # case-insensitively
            meta_robots_content = (meta_robots.get('content') or '').lower()
            if 'nofollow' in meta_robots_content:
                return

        for a in soup.find_all('a', href=True):
            href = a['href'].strip()
            if href == '' or href[0] == '#':
                continue

            # drop the fragment: everything from the first '#'
            href = href.split('#', 1)[0]

            try:
                parsed = urlparse(href)
                if parsed.scheme and parsed.scheme not in ('http', 'https'):
                    # mailto:, javascript:, ftp: ... cannot be crawled
                    continue
                if parsed.scheme:
                    new_url = href
                else:
                    new_url = urljoin(url.geturl(), href)
            except ValueError:
                # malformed link (e.g. broken IPv6 literal)
                continue

            if new_url not in self.visited and new_url not in self.url_queue:
                self.url_queue.append(new_url)

    @timeout(0.7)
    def read_robots(self, rp):
        """
        Fetch and parse robots.txt through ``rp.read()``.

        The ``timeout(0.7)`` decorator is defined outside this class;
        presumably it aborts the call after 0.7 s -- confirm against its
        definition.
        """
        rp.read()