import time

from mpi_master_slave import Master  # assuming the mpi-master-slave package layout


class MyApp(object):
    """
    This is my application that has a lot of work to do, so it hands work
    out to its slaves until all the work is done.
    """

    def __init__(self, slaves):
        # when creating the Master we tell it which slaves it can handle
        self.master = Master(slaves)

    def terminate_slaves(self):
        """
        Call this to make all slaves exit their run loop.
        """
        self.master.terminate_slaves()

    def run(self, tasks=10):
        """
        This is the core of the application: keep handing work to slaves
        as long as there is work to do.
        """
        work_queue = [i for i in range(tasks)]  # let's pretend this is our work queue

        #
        # while we have work to do and not all slaves have completed
        #
        while work_queue or not self.master.done():

            #
            # give work to each idle slave
            #
            for slave in self.master.get_ready_slaves():
                if not work_queue:
                    break
                task = work_queue.pop(0)  # get the next task in the queue
                print('Master: slave %d is going to do task %d' % (slave, task))
                self.master.run(slave, data=('Do task', task))

            #
            # reclaim slaves that have finished working
            # so that we can assign them more work
            #
            for slave in self.master.get_completed_slaves():
                done, message = self.master.get_data(slave)
                if done:
                    print('Master: slave %d finished its task and says "%s"' % (slave, message))
                else:
                    print('Master: slave %d failed to accomplish its task' % slave)

            # sleep for a while before polling again
            time.sleep(0.3)
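# The master class above only makes sense together with a slave process and an
# MPI entry point that decides which role each rank plays. The sketch below is
# an assumption based on the usual mpi-master-slave usage pattern: the MySlave
# class name, its do_work() body, and the main() wiring are not part of the
# original code.

from mpi4py import MPI
from mpi_master_slave import Slave  # assuming the mpi-master-slave package layout


class MySlave(Slave):
    """
    A slave process extends the Slave class, overrides do_work()
    and calls Slave.run(). The Master does the rest.
    """

    def do_work(self, data):
        rank = MPI.COMM_WORLD.Get_rank()
        name = MPI.Get_processor_name()
        task, task_arg = data  # matches the ('Do task', task) tuple sent by the master
        print('  Slave %s rank %d executing "%s" with argument %d' % (name, rank, task, task_arg))
        # the (done, message) pair is what the master unpacks in get_data()
        return (True, 'I completed my task (%d)' % task_arg)


def main():
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    if rank == 0:
        # rank 0 acts as the master; all other ranks run as slaves
        app = MyApp(slaves=range(1, size))
        app.run()
        app.terminate_slaves()
    else:
        MySlave().run()


if __name__ == "__main__":
    main()
    # launch with e.g.: mpiexec -n 4 python my_app.py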
import urllib.robotparser
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from mpi_master_slave import Master  # assuming the mpi-master-slave package layout

# Utils (which provides USER_AGENT) and the timeout decorator are
# project-specific helpers assumed to be defined elsewhere.


class MyApp(object):
    """
    This is my application that has a lot of work to do, so it hands work
    out to its slaves until all the work is done.
    """

    def __init__(self, slaves):
        # when creating the Master we tell it which slaves it can handle
        self.master = Master(slaves)
        self.url_queue = ['http://riweb.tibeica.com/crawl/']
        # other seed urls: 'http://fanacmilan.com/', 'http://www.greenworldmoldova.com/'
        self.visited = {}
        self.UTILS = Utils()
        self.limit = 200

    def terminate_slaves(self):
        """
        Call this to make all slaves exit their run loop.
        """
        self.master.terminate_slaves()

    def run(self):
        """
        This is the core of the application: keep handing work to slaves
        as long as there is work to do.
        """
        #
        # while we have work to do and not all slaves have completed
        #
        while self.url_queue or not self.master.done():

            #
            # give work to each idle slave
            #
            for slave in self.master.get_ready_slaves():
                if not self.url_queue:
                    break
                current_url = self.url_queue.pop(0)  # get the next url in the queue
                if current_url in self.visited:
                    continue
                print('Slave {0} is going to process url {1}'.format(slave, current_url))

                # check the url against robots.txt before dispatching it
                rp = urllib.robotparser.RobotFileParser()
                try:
                    url = urlparse(current_url)
                    rp.set_url(url.scheme + '://' + url.netloc + '/robots.txt')
                    self.read_robots(rp)
                    if not rp.can_fetch(self.UTILS.USER_AGENT, current_url):
                        continue
                except Exception:
                    continue

                # mark the current url as visited
                self.visited[url.scheme + '://' + url.netloc + url.path] = True

                self.master.run(slave, data=current_url)
                self.limit -= 1
                print('Limit: {}'.format(self.limit))

            #
            # reclaim slaves that have finished working
            # so that we can assign them more work
            #
            for slave in self.master.get_completed_slaves():
                done, code, file_path, url = self.master.get_data(slave)
                if done:
                    if code == 200:
                        self.get_links(file_path, url)
                    else:
                        # on a redirect the slave returns the new location
                        # instead of a saved file path
                        new_location = file_path
                        if code == 301:
                            self.visited.pop(url.geturl(), None)
                        if new_location not in self.url_queue and new_location not in self.visited:
                            self.url_queue.insert(0, new_location)
                else:
                    print('Failed to process the url: {0} --- Response code: {1}'
                          .format(url.geturl(), code))

            if self.limit <= 0:
                self.terminate_slaves()

    def get_links(self, html_file, url):
        with open(html_file, 'r') as file:
            soup = BeautifulSoup(file, features="html.parser")

            # if a robots meta tag exists, check whether following links is allowed
            meta_robots = soup.find("meta", attrs={"name": "robots"})
            if meta_robots:
                meta_robots_content = meta_robots['content']
                if 'nofollow' in meta_robots_content:
                    return

            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if href == '' or href[0] == '#':
                    continue
                if '#' in href:
                    href = ''.join(href.split('#')[:-1])  # drop the fragment
                if href.startswith('http://') or href.startswith('https://'):
                    # absolute url: enqueue it as-is
                    if href not in self.visited and href not in self.url_queue:
                        self.url_queue.append(href)
                else:
                    # relative url: resolve it against the page it was found on
                    new_url = urljoin(url.geturl(), href)
                    if new_url not in self.visited and new_url not in self.url_queue:
                        self.url_queue.append(new_url)

    @timeout(0.7)
    def read_robots(self, rp):
        # abort the robots.txt download if it takes too long
        rp.read()
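# The crawler master above expects each slave to return a
# (done, code, file_path, url) tuple, where 'url' supports .geturl() and, on a
# redirect, the file_path slot carries the new location instead of a saved
# file. The slave below is a minimal sketch of that contract: the CrawlerSlave
# name, the download and file-naming logic, and the user-agent string are
# assumptions, not part of the original code.

import os
import urllib.error
import urllib.request
from urllib.parse import urlparse

from mpi_master_slave import Slave  # assuming the mpi-master-slave package layout


class CrawlerSlave(Slave):

    def do_work(self, data):
        current_url = data
        parsed = urlparse(current_url)
        request = urllib.request.Request(
            current_url, headers={'User-Agent': 'my-crawler'})  # assumed user agent
        try:
            with urllib.request.urlopen(request, timeout=5) as response:
                # save the page so the master can extract links from it later
                os.makedirs('pages', exist_ok=True)
                file_path = os.path.join(
                    'pages', (parsed.netloc + parsed.path).replace('/', '_'))
                with open(file_path, 'wb') as out:
                    out.write(response.read())
                return True, response.getcode(), file_path, parsed
        except urllib.error.HTTPError as error:
            # note: urlopen follows redirects by default, so the 301 branch in
            # the master is only exercised if redirect handling is disabled
            new_location = error.headers.get('Location', '')
            return True, error.code, new_location, parsed
        except Exception:
            return False, None, None, parsed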