class ChatMessageStore:
    """Thread-safe, append-only store of chat messages with monotonically
    increasing ids, guarded by a reader/writer lock."""

    def __init__(self):
        self.__id = 1          # id assigned to the next message
        self.__storage = []    # list of (id, message), in insertion order
        self.__rwlock = RWLock()

    def add_message(self, message):
        """Append *message* under the write lock and advance the id counter."""
        self.__rwlock.acquire_write()
        try:
            self.__storage.append((self.__id, message))
            self.__id += 1
        finally:
            # Release even if the append raises, so the lock never leaks
            # (the original had no try/finally).
            self.__rwlock.release()

    def get_messages(self, start_id=0):
        """Return (messages with id >= start_id, next id to be assigned)."""
        self.__rwlock.acquire_read()
        try:
            return ([message for id_, message in self.__storage if id_ >= start_id],
                    self.__id)
        finally:
            self.__rwlock.release()
class TweetWorker(object):
    """Fans usernames out to worker threads and funnels results to a consumer.

    NOTE(review): the workers read from the module-level ``job_queue`` while
    ``self.jobs`` is created but never used -- confirm which queue is intended.
    """

    def __init__(self, n, callback):
        self.processing_users = []   # usernames currently being worked on
        self.lock = RWLock()         # guards processing_users
        self.jobs = ThreadQueue.Queue()
        self.results = ThreadQueue.Queue()
        self.processes = []
        for _ in range(0, n):
            s = Thread(target=worker, args=(job_queue, self.results))
            self.processes.append(s)
            s.daemon = True
            s.start()
        print('Started {} worker processes'.format(len(self.processes)))
        self.consumer = Thread(target=consumer_process, args=(self.results, callback))
        self.consumer.daemon = True
        self.consumer.start()
        print('Started consumer process')

    def get(self, username):
        """Queue *username* for processing unless it is already in flight."""
        if username is None:
            return
        # Hold the write lock across BOTH the membership test and the append:
        # the original released the read lock before re-acquiring the write
        # lock, so two threads could enqueue the same user concurrently.
        self.lock.acquire_write()
        try:
            if username in self.processing_users:
                return
            self.processing_users.append(username)
        finally:
            self.lock.release()
        job_queue.put(username)
        return
class DeepDist: def __init__(self, model, master='127.0.0.1:5000', min_updates=0, max_updates=4096): """DeepDist - Distributed deep learning. :param model: provide a model that can be trained in parallel on the workers """ self.model = model self.lock = RWLock() self.descent = lambda model, gradient: model self.master = master self.state = 'serving' self.served = 0 self.received = 0 #self.server = None self.pmodel = None self.min_updates = min_updates self.max_updates = max_updates def __enter__(self): Thread(target=self.start).start() # self.server = Process(target=self.start) # self.server.start() return self def __exit__(self, type, value, traceback): # self.server.terminate() pass # need to shut down server here def start(self): from flask import Flask, request app = Flask(__name__) @app.route('/') def index(): return 'DeepDist' @app.route('/model', methods=['GET', 'POST', 'PUT']) def model_flask(): i = 0 while (self.state != 'serving' or self.served >= self.max_updates) and (i < 1000): time.sleep(1) i += 1 # pickle on first read pmodel = None self.lock.acquire_read() if not self.pmodel: self.lock.release() self.lock.acquire_write() if not self.pmodel: print(self.model) self.pmodel = pickle.dumps(self.model, -1) self.served += 1 pmodel = self.pmodel self.lock.release() else: self.served += 1 pmodel = self.pmodel self.lock.release() return pmodel @app.route('/update', methods=['GET', 'POST', 'PUT']) def update_flask(): gradient = pickle.loads(request.data) self.lock.acquire_write() if self.min_updates <= self.served: state = 'receiving' self.received += 1 self.descent(self.model, gradient) if self.received >= self.served and self.min_updates <= self.received: self.received = 0 self.served = 0 self.state = 'serving' self.pmodel = None self.lock.release() return 'OK' print 'Listening to 0.0.0.0:5000...' 
app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False) def train(self, rdd, gradient, descent): master = self.master # will be pickled if master == None: master = rdd.ctx._conf.get('spark.master') if master.startswith('local['): master = 'localhost:5000' else: if master.startswith('spark://'): master = '%s:5000' % urlparse.urlparse(master).netloc.split(':')[0] else: master = '%s:5000' % master.split(':')[0] print '\n*** Master: %s\n' % master self.descent = descent def mapPartitions(data): #a = fetch_model(master=master) #print(a) aa = gradient(fetch_model(master=master), data) bb = pickle.dumps(aa) #print aa return [send_gradient(gradient(fetch_model(master=master), data), master=master)] return rdd.mapPartitions(mapPartitions).collect()
class DeepDist:
    def __init__(self, model, master='127.0.0.1:5000', min_updates=0, max_updates=4096):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model/pmodel and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = 'serving'
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        #self.server = None
        self.pmodel = None    # cached pickled model
        self.min_updates = min_updates
        self.max_updates = max_updates
        # Debug output left in by the author.
        print("THIS IS THE MASTER")
        print(self.master)
        print("\n")
        print("THIS IS THE MODEL 1")
        print(self.model)
        print("\n")

    def __enter__(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()
        # self.server = Process(target=self.start)
        # self.server.start()
        return self

    def __exit__(self, type, value, traceback):
        # self.server.terminate()
        pass  # need to shut down server here

    def start(self):
        # Flask HTTP server exposing the model and accepting gradient updates.
        from flask import Flask, request
        app = Flask(__name__)

        @app.route('/')
        def index():
            return 'DeepDist'

        @app.route('/model', methods=['GET', 'POST', 'PUT'])
        def model_flask():
            # Wait (bounded at ~1000 s) while updates are being applied or
            # the serving budget is exhausted.
            i = 0
            while (self.state != 'serving' or self.served >= self.max_updates) and (i < 1000):
                time.sleep(1)
                i += 1
            # pickle on first read
            pmodel = None
            self.lock.acquire_read()
            if not self.pmodel:
                # Upgrade to the write lock to build the pickle cache once.
                self.lock.release()
                self.lock.acquire_write()
                if not self.pmodel:
                    # NOTE(review): pickleDumper is defined elsewhere in the
                    # project -- presumably a pickle wrapper; confirm.
                    self.pmodel = pickleDumper.dumps(self.model, -1)
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            else:
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            return pmodel

        @app.route('/update', methods=['GET', 'POST', 'PUT'])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            if self.min_updates <= self.served:
                # NOTE(review): assigns a dead local 'state'; probably meant
                # self.state = 'receiving' -- as written /model never blocks.
                state = 'receiving'
            self.received += 1
            self.descent(self.model, gradient)
            if self.received >= self.served and self.min_updates <= self.received:
                # Every served replica has reported: start a new cycle.
                self.received = 0
                self.served = 0
                self.state = 'serving'
                self.pmodel = None
            self.lock.release()
            return 'OK'

        print 'Listening to 0.0.0.0:5000...'
        app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        master = self.master  # will be pickled into the mapPartitions closure
        print("MASTER ROUND 2")
        print(master)
        if master == None:
            master = rdd.ctx._conf.get('spark.master')
        if master.startswith('local['):
            master = 'localhost:5000'
        else:
            if master.startswith('spark://'):
                master = '%s:5000' % urlparse.urlparse(master).netloc.split(':')[0]
            else:
                master = '%s:5000' % master.split(':')[0]
        print '\n*** Master: %s\n' % master
        self.descent = descent

        def mapPartitions(data):
            # Fetch the current model, compute the partition gradient, send it back.
            return [
                send_gradient(gradient(fetch_model(master=master), data),
                              master=master)
            ]

        return rdd.mapPartitions(mapPartitions).collect()
class DeepDist:
    def __init__(self, model, master='127.0.0.1:5000', min_updates=0, max_updates=4096):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model/pmodel and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = 'serving'
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        self.server = '0.0.0.0'  # bind address for the Flask server
        self.pmodel = None    # cached pickled model
        self.min_updates = min_updates
        self.max_updates = max_updates

    def __enter__(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()
        # self.server = Process(target=self.start)
        # self.server.start()
        return self

    def __exit__(self, type, value, traceback):
        # Ask the running Flask server to shut itself down via /shutdown.
        url = "http://%s/shutdown" % self.master
        response = urllib2.urlopen(url, '{}').read()
        print("Exit requested...")

    def start(self):
        from flask import Flask, request
        app = Flask(__name__)

        @app.route('/')
        def index():
            return 'DeepDist'

        @app.route('/model', methods=['GET', 'POST', 'PUT'])
        def model_flask():
            # Wait (bounded) while updates are applied or budget exhausted.
            i = 0
            while (self.state != 'serving' or self.served >= self.max_updates) and (i < 1000):
                time.sleep(1)
                i += 1
            # pickle on first read
            pmodel = None
            self.lock.acquire_read()
            if not self.pmodel:
                # Upgrade to the write lock to build the pickle cache once.
                self.lock.release()
                self.lock.acquire_write()
                if not self.pmodel:
                    # NOTE(review): pickleDumper is defined elsewhere; confirm.
                    self.pmodel = pickleDumper.dumps(self.model, -1)
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            else:
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            return pmodel

        @app.route('/update', methods=['GET', 'POST', 'PUT'])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            if self.min_updates <= self.served:
                # NOTE(review): assigns a dead local 'state'; probably meant
                # self.state = 'receiving'.
                state = 'receiving'
            self.received += 1
            self.descent(self.model, gradient)
            if self.received >= self.served and self.min_updates <= self.received:
                self.received = 0
                self.served = 0
                self.state = 'serving'
                self.pmodel = None
            self.lock.release()
            return 'OK'

        @app.route('/shutdown', methods=['POST'])
        def shutdown():
            # NOTE(review): 'werkzeug.server.shutdown' was removed in newer
            # Werkzeug releases -- confirm the deployed version supports it.
            func = request.environ.get('werkzeug.server.shutdown')
            if func is None:
                raise RuntimeError('Not running with the Werkzeug Server')
            func()
            return 'Server shutting down...'

        print 'Listening to 0.0.0.0:5000...'
        app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        master = self.master  # will be pickled
        if master == None:
            master = rdd.ctx._conf.get('spark.master')
        if master.startswith('local['):
            master = 'localhost:5000'
        else:
            if master.startswith('spark://'):
                master = '%s:5000' % urlparse.urlparse(master).netloc.split(':')[0]
            else:
                master = '%s:5000' % master.split(':')[0]
        print '\n*** Master: %s\n' % master
        self.descent = descent

        def mapPartitions(data):
            return [
                send_gradient(gradient(fetch_model(master=master), data),
                              master=master)
            ]

        return rdd.mapPartitions(mapPartitions).collect()
class DeepDist:
    def __init__(self, model, master='127.0.0.1:5000', min_updates=0, max_updates=4096):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model/pmodel and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = 'serving'
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        #self.server = None
        self.pmodel = None    # cached pickled model
        self.min_updates = min_updates
        self.max_updates = max_updates

    def start_server(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()

    def start(self):
        from flask import Flask, request
        app = Flask(__name__)

        @app.route('/')
        def index():
            return 'DeepDist'

        @app.route('/model', methods=['GET', 'POST', 'PUT'])
        def model_flask():
            # Wait (bounded) while updates are applied or budget exhausted.
            i = 0
            while (self.state != 'serving' or self.served >= self.max_updates) and (i < 1000):
                time.sleep(1)
                i += 1
            # pickle on first read
            pmodel = None
            self.lock.acquire_read()
            if not self.pmodel:
                # Upgrade to the write lock to build the pickle cache once.
                self.lock.release()
                self.lock.acquire_write()
                if not self.pmodel:
                    self.pmodel = pickle.dumps(self.model, -1)
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            else:
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            print "model replica weights were updated via /model"
            return pmodel

        @app.route('/update', methods=['GET', 'POST', 'PUT'])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            if self.min_updates <= self.served:
                # NOTE(review): assigns a dead local 'state'; probably meant
                # self.state = 'receiving'.
                state = 'receiving'
            self.received += 1
            # assumes the model exposes syn0/syn1 weight arrays
            # (word2vec-style) -- TODO confirm
            old_syn0, old_syn1 = self.model.syn0.copy(), self.model.syn1.copy()
            print "received gradient: " + str(gradient)
            self.descent(self.model, gradient)
            if self.received >= self.served and self.min_updates <= self.received:
                self.received = 0
                self.served = 0
                self.state = 'serving'
                self.pmodel = None
            self.lock.release()
            # Debug: show a corner of the weight matrices before/after.
            print "server weights were updated by model replica"
            print "old weights: "
            print old_syn0[0:3, 0:3], old_syn1[0:3, 0:3]  #printing just the first few weights
            print "new weights: "
            print self.model.syn0[0:3, 0:3], self.model.syn1[0:3, 0:3]
            return 'OK'

        print 'Listening to 0.0.0.0:5000...'
        app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        # master is captured (and pickled) by the mapPartitions closure.
        master = self.master
        print '\n*** Master: %s\n' % master
        self.descent = descent

        def mapPartitions(data):
            return [
                send_gradient(gradient(fetch_model(master=master), data),
                              master=master)
            ]

        return rdd.mapPartitions(mapPartitions).collect()
class Volumn(object):
    """XML-RPC volume server: files are packed into preallocated volume
    files on disk, with metadata persisted in pickled 'vdb'/'fdb' dicts."""

    _rpc_methods = [
        'assign_volumn', 'store', 'replica', 'download', 'status', 'balance',
        'migrate_volumn_to', 'migrate_volumn_from', 'delete_file',
        'delete_volumn'
    ]

    def __init__(self, logger, host, port):
        self.logger = logger
        self.host = host
        self.port = port
        self.lock = RWLock()   # guards vdb/fdb mutation
        self.vdb = dict()      # vid -> volume doc (path, size, counter)
        self.fdb = dict()      # fid ("vid,seq") -> file doc (offset, size, delete)
        if os.path.isfile('vdb'):
            self.vdb = pickle.load(open('vdb', 'rb'))
        if os.path.isfile('fdb'):
            self.fdb = pickle.load(open('fdb', 'rb'))
        self.act_mst_serv = list()   # active master server URLs
        self.serv = ThreadXMLRPCServer((self.host, self.port), logRequests=True)
        for name in self._rpc_methods:
            self.serv.register_function(getattr(self, name))

    def _update_vdb(self):
        # Persist the volume metadata.
        pickle.dump(self.vdb, open('vdb', 'wb'))

    def _update_fdb(self):
        # Persist the file metadata.
        pickle.dump(self.fdb, open('fdb', 'wb'))

    def update_master(self, masters):
        self.act_mst_serv = masters

    def get_master(self):
        # Any active master will do.
        return ServerProxy(random.choice(self.act_mst_serv))

    def assign_volumn(self, vid, size):
        """Preallocate a volume file of *size* bytes and register it."""
        path = 'data/%s' % vid
        if not os.path.isdir('data'):
            os.mkdir('data')
        try:
            # BUGFIX: this mutates vdb, so it needs the WRITE lock
            # (the original took the read lock).
            self.lock.acquire_write()
            with open(path, 'wb') as f:
                f.seek(size - 1)
                f.write(b'\0')
            vdoc = dict()
            vdoc['vid'] = vid
            vdoc['path'] = path
            vdoc['size'] = size
            vdoc['counter'] = 0   # next free write offset
            self.vdb[vid] = vdoc
            self._update_vdb()
            return True
        except:
            return False
        finally:
            self.lock.release()

    def migrate_volumn_to(self, vid, to_addr):
        """Stream volume *vid* (data chunks, then metadata) to *to_addr*."""
        try:
            vdoc = self.vdb[vid]
            path = vdoc['path']
            s = ServerProxy(to_addr)
            with open(path, 'rb') as f:
                while True:
                    data = f.read(64 * 1024 * 1024)   # 64 MiB chunks
                    if data:
                        s.migrate_volumn_from(vid, data, vdoc)
                        self.logger.info('Send data...')
                    else:
                        # EOF: send the file docs belonging to this volume.
                        fdocs = {
                            k: v
                            for k, v in self.fdb.items()
                            if k.startswith('%d,' % vid)
                        }
                        self.logger.info('Send metadata')
                        s.migrate_volumn_from(vid, data, vdoc, fdocs, True)
                        break
            return True
        except:
            return False

    def migrate_volumn_from(self, vid, data, vdoc, fdocs=None, done=False):
        """Receive one chunk (or, when *done*, the metadata) of a migration."""
        path = vdoc['path']
        if not os.path.isdir('data'):
            os.mkdir('data')
        if done:
            self.vdb[vid] = vdoc
            self._update_vdb()
            self.fdb = {**self.fdb, **fdocs}
            self._update_fdb()
            self.logger.info('Build replica %d success' % vid)
        else:
            with open(path, 'ab') as f:
                f.write(data.data)
        return True

    def store(self, fid, data):
        """Store *fid* locally, then fan the replica out to sibling volumes."""
        vid, _ = fid.split(',')
        vid = int(vid)
        try:
            self.replica(fid, data)
            master = self.get_master()
            volumns = master.find_writable_volumn(vid)
            if not volumns:
                return False
            for volumn in volumns:
                if volumn != 'http://%s:%d' % (self.host, self.port):
                    s = ServerProxy(volumn)
                    s.replica(fid, data)
            return True
        except Exception as e:
            self.logger.exception('Got an exception')
            return False

    def replica(self, fid, data):
        """Append *data* to the owning volume and record the file doc."""
        data = data.data
        vid, _ = fid.split(',')
        vid = int(vid)
        self.lock.acquire_write()
        vdoc = self.vdb[vid]
        path = vdoc['path']
        offset = vdoc['counter']   # claim the next free region
        size = len(data)
        vdoc['counter'] += size
        self.lock.release()
        with open(path, 'r+b') as f:
            f.seek(offset)
            f.write(data)
        fdoc = dict()
        fdoc['fid'] = fid
        fdoc['offset'] = offset
        fdoc['size'] = size
        fdoc['delete'] = False   # tombstone flag, set by delete_file
        self.vdb[vid] = vdoc
        self._update_vdb()
        self.fdb[fid] = fdoc
        self._update_fdb()
        return True

    def update_file(self, fid, data):
        pass

    def delete_file(self, fid, sync=True):
        """Tombstone *fid*; when *sync*, propagate to sibling volumes."""
        vid, _ = fid.split(',')
        vid = int(vid)
        try:
            if sync:
                master = self.get_master()
                volumns = master.find_writable_volumn(vid)
                if not volumns:
                    return False
                for volumn in volumns:
                    if volumn != 'http://%s:%d' % (self.host, self.port):
                        s = ServerProxy(volumn)
                        s.delete_file(fid, False)
            fdoc = self.fdb[fid]
            fdoc['delete'] = True   # space is reclaimed later by balance()
            self._update_fdb()
            return True
        except Exception as e:
            self.logger.exception('Got an exception')
            return False

    def delete_volumn(self, vid):
        pass

    def download(self, fid):
        """Return the bytes of *fid*, or None if unknown/deleted/failed."""
        vid, _ = fid.split(',')
        vid = int(vid)
        if vid not in self.vdb or fid not in self.fdb:
            return None
        try:
            self.lock.acquire_read()
            vdoc = self.vdb[vid]
            fdoc = self.fdb[fid]
            if fdoc['delete'] == True:
                return None
            path = vdoc['path']
            offset = fdoc['offset']
            size = fdoc['size']
            with open(path, 'rb') as f:
                f.seek(offset)
                data = f.read(size)
            return data
        except:
            return None
        finally:
            self.lock.release()

    def balance(self, vid):
        """Compact volume *vid* by copying live files into a fresh file."""
        try:
            self.lock.acquire_write()
            vdoc = self.vdb[vid]
            fdocs = self.fdb
            tfdocs = fdocs.copy()
            tvdoc = vdoc.copy()
            tvdoc['counter'] = 0
            path = vdoc['path']
            size = vdoc['size']
            with open(path + '.tmp', 'wb') as f:
                f.seek(size - 1)
                f.write(b'\0')
            with open(path, 'r+b') as from_file, open(path + '.tmp', 'r+b') as to_file:
                to_file.seek(0)
                for fdoc in fdocs.values():
                    if fdoc['fid'].startswith('%d,' % vid) and fdoc['delete'] == False:
                        # BUGFIX: read from the file's offset (the original
                        # seeked to fdoc['size'], copying the wrong bytes).
                        from_file.seek(fdoc['offset'])
                        data = from_file.read(fdoc['size'])
                        to_file.write(data)
                        tfdoc = fdoc.copy()
                        tfdoc['offset'] = tvdoc['counter']
                        tvdoc['counter'] += fdoc['size']
                        tfdocs[fdoc['fid']] = tfdoc
            os.remove(path)
            os.rename(path + '.tmp', path)
            self.vdb[vid] = tvdoc
            # BUGFIX: the original assigned self.fdocs (a dead attribute);
            # the rewritten docs belong in fdb, persisted like elsewhere.
            self.fdb = tfdocs
            self._update_vdb()
            self._update_fdb()
            return True
        except:
            self.logger.exception('Got an exception')
            return False
        finally:
            self.lock.release()

    def status(self):
        """Report disk usage and volume metadata (stringified for XML-RPC)."""
        status = dict()
        total, used, free = shutil.disk_usage(__file__)
        status['total'] = str(total)
        status['used'] = str(used)
        status['free'] = str(free)
        status['vdb'] = {str(vid): vdoc for vid, vdoc in self.vdb.items()}
        return status

    def start(self):
        self.serv.serve_forever()
class MSGControler:
    """TCP message controller: accepts client connections, broadcasts
    events to all clients, and keeps a size-rotated log file."""

    #def __init__(self, listen_ip, listen_port, log_file, debug):
    def __init__(self, configDic):
        """ Init Controler class """
        self.running = True
        self.config = configDic
        self.log_file = self.config["log"]
        self.debug = self.config["debug"]
        self.security = self.config["security"]
        self.clients = {}
        self.lock = RWLock()   # serializes broadcasts in tell_all()
        ## Start TCP comm server ##
        listen_ip = self.config["listen_ip"]
        listen_port = self.config["listen_port"]
        try:
            self.server = tcpServer((listen_ip, listen_port), handleConnection,
                                    self.clients, self.debug, self.security)
        except:
            # Port is busy: find the process bound to it, kill it, retry once.
            self.log_error("Unable to bind TCP socket %s:%s !" % (listen_ip, listen_port))
            proc = subprocess.Popen(["ss", "-pant"], stdout=subprocess.PIPE)
            code = proc.wait()
            for aline in proc.stdout:
                if (str(listen_ip) + ':' + str(listen_port)) in aline and "LISTEN" in aline:
                    # BUGFIX: parentheses must be escaped in the regexes, and
                    # the inner re.sub was missing its replacement argument.
                    tmpstr1 = re.sub(r'\).*', '', re.sub(r'.*\(', '', aline))
                    pid = re.sub(r',.*', '', re.sub(r'.*pid=', '', tmpstr1))
                    prog = re.sub(r'.*"', '', re.sub(r'",.*', '', aline))
                    self.log_warning("Process %s, PID %s, is binding port %s. It will be killed." % (prog, pid, listen_port))
                    os.system("kill -9 %s" % pid)
            time.sleep(10)
            self.log_info("Trying again to bind %s on %s." % (listen_port, listen_ip))
            self.server = tcpServer((listen_ip, listen_port), handleConnection,
                                    self.clients, self.debug, self.security)
        self.comm_thread = threading.Thread(target=self.server.serve_forever)
        self.comm_thread.daemon = True
        self.comm_thread.start()
        ##### Send a keepalive message every minutes (60 sec) ##
        self.keepalive = KeepAliveTimer(60, self.send_keepalive, ["KeepAliveTimer"])
        self.keepalive.start()

    def log_error(self, newline):
        self.log(newline, "ERROR")

    def log_warning(self, newline):
        self.log(newline, "WARNING")

    def log_info(self, newline):
        self.log(newline, "INFO")

    def log_event(self, newline):
        self.log(newline, "EVENT")

    def log_debug(self, newline):
        if self.debug == True:
            self.log(newline, "DEBUG")

    def log(self, newline, level="INFO"):
        """Append a timestamped line; rotate when the log exceeds ~1 MB."""
        LOG_SIZE = os.path.getsize(self.log_file)
        # if > 1M rotate: .3 -> .4, .2 -> .3, .1 -> .2, current -> .1
        if LOG_SIZE > 1000000:
            if os.path.exists(self.log_file + ".4"):
                os.remove(self.log_file + ".4")
            # BUGFIX: the original renamed .3 -> .4 inside the ".4 exists"
            # branch, which crashed when .3 was absent, then tried again.
            if os.path.exists(self.log_file + ".3"):
                os.rename(self.log_file + ".3", self.log_file + ".4")
            if os.path.exists(self.log_file + ".2"):
                os.rename(self.log_file + ".2", self.log_file + ".3")
            if os.path.exists(self.log_file + ".1"):
                os.rename(self.log_file + ".1", self.log_file + ".2")
            os.rename(self.log_file, self.log_file + ".1")
            if os.path.exists('/opt/virtualisation/openkvi/debug'):
                os.remove('/opt/virtualisation/openkvi/debug')
            logs = open(self.log_file, 'w')
        else:
            logs = open(self.log_file, 'a')
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        logs.write(timestamp + "::[" + level + "]::" + newline + "\n")
        logs.close()

    def print_debug(self, msg):
        if self.debug == True:
            self.log_debug(msg)
            print(msg)

    def tell_all(self, event, data):
        """Broadcast *event*;*data* (JSON) to every connected client."""
        self.keepalive.stop()
        self.print_debug("telling all %s %s" % (event, data))
        line = event + ";" + json.dumps(data)
        ## Acquire lock so that no messages are sent
        ## simultanously
        self.lock.acquire_write()
        res = self.server.writeToAll(line)
        ## Wait 500 ms between two message to prevent
        ## clients being overwhelmed
        time.sleep(0.5)
        self.lock.release()
        self.keepalive.start()

    def stop(self):
        self.print_debug("stop tcp server")
        self.keepalive.stop()
        self.server.socket.close()

    def send_keepalive(self):
        res = self.server.writeToAll("keep alive")
class tcpServer(SocketServer.ThreadingTCPServer):
    """Threading TCP server that broadcasts JSON messages to all known
    clients and reaps connections that stop responding."""

    def __init__(self, server_address, RequestHandlerClass, clients, debug, security):
        self.allow_reuse_address = True
        SocketServer.ThreadingTCPServer.__init__(self, server_address, RequestHandlerClass)
        self.clients = clients  # shared dict: client id -> connection state
        #self.arg2 = arg2
        self.rwlock = RWLock()  # guards the clients dict itself
        self.debug = debug
        self.security = security

    def print_debug(self, msg):
        if self.debug == True:
            print msg

    def writeToAll(self, line):
        """Send *line* (wrapped in a JSON 'messages' envelope) to every
        client; drop clients unresponsive for more than 20 seconds."""
        lines = []
        lines.append(line)
        msg = {}
        msg['messages'] = lines
        try:
            # Snapshot the client ids under the read lock.
            self.rwlock.acquire_read()
            keys = self.clients.keys()
            self.rwlock.release()
        except:
            err = str(sys.exc_info()[1]).strip("'")
            return 0
        for an_id in keys:
            # Each client entry carries its own lock for its connection.
            self.clients[an_id]['lock'].acquire_write()
            conn = self.clients[an_id]['connection']
            try:
                #self.print_debug("trying to say %s to %s" % (line.strip(), an_id))
                conn.wfile.write(json.dumps(msg) + "\n")
                self.clients[an_id]['lock'].release()
            except:
                # Write failed: if the client has been silent for over 20s,
                # close it, forget it, and revoke its auth token.
                self.print_debug("Not able to speak to %s" % an_id)
                self.clients[an_id]['lock'].release()
                timestamp = self.clients[an_id]["timestamp"]
                if timestamp != 0:
                    timeout = datetime.datetime.now() - timestamp
                    if timeout > datetime.timedelta(seconds=20):
                        self.print_debug("connection to %s timed out !" % an_id)
                        try:
                            conn.finish()
                        except:
                            self.print_debug("connection to %s is already finished" % an_id)
                        del self.clients[an_id]
                        if self.security != "low":
                            #curl -k https://localhost/_auth/unset/?id=12345"
                            url = 'https://localhost/_auth/unset/?id=' + an_id
                            proc = subprocess.Popen(['curl', '-k', url], stdout=subprocess.PIPE)
                            code = proc.wait()
            else:
                # Success: refresh the timestamp and record the message.
                # NOTE(review): the collapsed original is ambiguous about
                # whether this 'else' belongs to the try or to an inner 'if';
                # try/except/else is the reading consistent with the
                # timestamp-based timeout above -- confirm against history.
                self.clients[an_id]['lock'].acquire_write()
                self.clients[an_id]['timestamp'] = datetime.datetime.now()
                self.clients[an_id]['messages'].append(line)
                self.clients[an_id]['lock'].release()
        return 0
class DeepDist:
    def __init__(self, model, master='127.0.0.1:5000', min_updates=0, max_updates=4096):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model/pmodel and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = 'serving'
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        #self.server = None
        self.pmodel = None    # cached pickled model
        self.min_updates = min_updates
        self.max_updates = max_updates

    def start_server(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()

    def start(self):
        from flask import Flask, request
        app = Flask(__name__)

        @app.route('/')
        def index():
            return 'DeepDist'

        @app.route('/model', methods=['GET', 'POST', 'PUT'])
        def model_flask():
            # Wait (bounded) while updates are applied or budget exhausted.
            i = 0
            while (self.state != 'serving' or self.served >= self.max_updates) and (i < 1000):
                time.sleep(1)
                i += 1
            # pickle on first read
            pmodel = None
            self.lock.acquire_read()
            if not self.pmodel:
                # Upgrade to the write lock to build the pickle cache once.
                self.lock.release()
                self.lock.acquire_write()
                if not self.pmodel:
                    self.pmodel = pickle.dumps(self.model, -1)
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            else:
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            print "model replica weights were updated via /model"
            return pmodel

        @app.route('/update', methods=['GET', 'POST', 'PUT'])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            if self.min_updates <= self.served:
                # NOTE(review): assigns a dead local 'state'; probably meant
                # self.state = 'receiving'.
                state = 'receiving'
            self.received += 1
            # assumes the model exposes syn0/syn1 weight arrays
            # (word2vec-style) -- TODO confirm
            old_syn0, old_syn1 = self.model.syn0.copy(), self.model.syn1.copy()
            print "received gradient: " + str(gradient)
            self.descent(self.model, gradient)
            if self.received >= self.served and self.min_updates <= self.received:
                self.received = 0
                self.served = 0
                self.state = 'serving'
                self.pmodel = None
            self.lock.release()
            # Debug: show a corner of the weight matrices before/after.
            print "server weights were updated by model replica"
            print "old weights: "
            print old_syn0[0:3, 0:3], old_syn1[0:3, 0:3]  #printing just the first few weights
            print "new weights: "
            print self.model.syn0[0:3, 0:3], self.model.syn1[0:3, 0:3]
            return 'OK'

        print 'Listening to 0.0.0.0:5000...'
        app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        # master is captured (and pickled) by the mapPartitions closure.
        master = self.master
        print '\n*** Master: %s\n' % master
        self.descent = descent

        def mapPartitions(data):
            return [send_gradient(gradient(fetch_model(master=master), data), master=master)]

        return rdd.mapPartitions(mapPartitions).collect()
class DeepDist:
    def __init__(self, model, master="127.0.0.1:5000", min_updates=0, max_updates=4096):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model/pmodel and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = "serving"
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        # self.server = None
        self.pmodel = None    # cached pickled model
        self.min_updates = min_updates
        self.max_updates = max_updates

    def __enter__(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()
        # self.server = Process(target=self.start)
        # self.server.start()
        return self

    def __exit__(self, type, value, traceback):
        # self.server.terminate()
        pass  # need to shut down server here

    def start(self):
        from flask import Flask, request
        app = Flask(__name__)

        @app.route("/")
        def index():
            return "DeepDist"

        @app.route("/model", methods=["GET", "POST", "PUT"])
        def model_flask():
            # Wait (bounded) while updates are applied or budget exhausted.
            i = 0
            while (self.state != "serving" or self.served >= self.max_updates) and (i < 1000):
                time.sleep(1)
                i += 1
            # pickle on first read
            pmodel = None
            self.lock.acquire_read()
            if not self.pmodel:
                # Upgrade to the write lock to build the pickle cache once.
                self.lock.release()
                self.lock.acquire_write()
                if not self.pmodel:
                    self.pmodel = pickle.dumps(self.model, -1)
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            else:
                self.served += 1
                pmodel = self.pmodel
                self.lock.release()
            return pmodel

        @app.route("/update", methods=["GET", "POST", "PUT"])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            if self.min_updates <= self.served:
                # NOTE(review): assigns a dead local 'state'; probably meant
                # self.state = "receiving".
                state = "receiving"
            self.received += 1
            self.descent(self.model, gradient)
            if self.received >= self.served and self.min_updates <= self.received:
                self.received = 0
                self.served = 0
                self.state = "serving"
                self.pmodel = None
            self.lock.release()
            return "OK"

        print "Listening to 0.0.0.0:5000..."
        app.run(host="0.0.0.0", debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        master = self.master  # will be pickled
        if master == None:
            master = rdd.ctx._conf.get("spark.master")
        if master.startswith("local["):
            master = "localhost:5000"
        else:
            if master.startswith("spark://"):
                master = "%s:5000" % urlparse.urlparse(master).netloc.split(":")[0]
            else:
                master = "%s:5000" % master.split(":")[0]
        print "\n*** Master: %s\n" % master
        self.descent = descent

        def mapPartitions(data):
            return [send_gradient(gradient(fetch_model(master=master), data), master=master)]

        return rdd.mapPartitions(mapPartitions).collect()
class DeepDist:
    def __init__(self, model, batch=None, master='127.0.0.1:5000'):
        """DeepDist - Distributed deep learning.

        :param model: provide a model that can be trained in parallel on the workers
        """
        self.model = model
        self.lock = RWLock()  # guards model and the served/received counters
        self.descent = lambda model, gradient: model  # replaced by train()
        self.master = master
        self.state = 'serving'
        self.served = 0       # models served since last reset
        self.received = 0     # gradients received since last reset
        self.batch = batch    # max items per gradient batch (None = whole partition)
        self.server = None

    def __enter__(self):
        # Serve the model from a background thread.
        Thread(target=self.start).start()
        # self.server = Process(target=self.start)
        # self.server.start()
        return self

    def __exit__(self, type, value, traceback):
        # self.server.terminate()
        pass  # need to shut down server here

    def start(self):
        from flask import Flask, request
        app = Flask(__name__)

        @app.route('/')
        def index():
            return 'DeepDist'

        @app.route('/model', methods=['GET', 'POST', 'PUT'])
        def model_flask():
            # Wait (bounded) while gradient updates are being applied.
            i = 0
            while (self.state != 'serving') and (i < 1000):
                time.sleep(1)
                i += 1
            # Deep-copy under the read lock so pickling happens outside it.
            self.lock.acquire_read()
            self.served += 1
            model = copy.deepcopy(self.model)
            self.lock.release()
            return pickle.dumps(model, -1)

        @app.route('/update', methods=['GET', 'POST', 'PUT'])
        def update_flask():
            gradient = pickle.loads(request.data)
            self.lock.acquire_write()
            # NOTE(review): assigns a dead local 'state'; probably meant
            # self.state = 'receiving'.
            state = 'receiving'
            self.received += 1
            self.descent(self.model, gradient)
            if self.received >= self.served:
                self.received = 0
                self.served = 0
                self.state = 'serving'
            self.lock.release()
            return 'OK'

        print 'Listening to 0.0.0.0:5000...'
        app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False)

    def train(self, rdd, gradient, descent):
        master = self.master  # will be pickled
        print 'master0: ', master
        if master == None:
            master = rdd.ctx._conf.get('spark.master')
        print 'master1: ', master
        if master.startswith('local['):
            master = 'localhost:5000'
        else:
            if master.startswith('spark://'):
                master = '%s:5000' % urlparse.urlparse(master).netloc.split(':')[0]
            else:
                master = '%s:5000' % master.split(':')[0]
        print '\n*** master: %s\n' % master
        self.descent = descent
        batch = self.batch

        def mapPartitions(data):
            last = 'dummy'

            class Iter:
                # Hands out at most 'batch' items per pass over the partition.
                def __iter__(self):
                    self.i = 0
                    return self

                def next(self):
                    if (batch == None) or (self.i < batch):
                        self.i += 1
                        # NOTE(review): this rebinds a LOCAL 'last' -- the
                        # outer 'last' driving the while loop below is never
                        # updated, so the loop only ends if data.next()
                        # raises; confirm the intended termination.
                        last = data.next()
                        return last
                    else:
                        return None

            res = []
            while last != None:
                res.append(send_gradient(gradient(fetch_model(master=master), Iter()), master=master))
            return res

        return rdd.mapPartitions(mapPartitions).collect()
class DeepDist: def __init__(self, model, host='127.0.0.1:5000'): """DeepDist - Distributed deep learning. :param model: provide a model that can be trained in parallel on the workers """ self.model = model self.lock = RWLock() self.descent = lambda model, gradient: model self.host = host self.state = 'serving' self.served = 0 self.received = 0 def __enter__(self): Thread(target=self.start).start() return self def __exit__(self, type, value, traceback): pass # need to shut down server here def start(self): from flask import Flask, request app = Flask(__name__) @app.route('/') def index(): return 'DeepDist' @app.route('/model', methods=['GET', 'POST', 'PUT']) def model_flask(): i = 0 while (self.state != 'serving') and (i < 20): time.sleep(1) i += 1 self.lock.acquire_read() self.served += 1 model = copy.deepcopy(self.model) self.lock.release() return pickle.dumps(model, -1) @app.route('/update', methods=['GET', 'POST', 'PUT']) def update_flask(): gradient = pickle.loads(request.data) self.lock.acquire_write() state = 'receiving' self.received += 1 self.descent(self.model, gradient) if self.received >= self.served: self.received = 0 self.served = 0 self.state = 'serving' self.lock.release() return 'OK' print 'Listening to 0.0.0.0:5000...' app.run(host='0.0.0.0', debug=True, threaded=True, use_reloader=False) def train(self, rdd, gradient, descent): self.descent = descent host = self.host # will be pickled by rdd.mapPartitions def mapPartitions(data): return (send_gradient(gradient(fetch_model(host=host), data), host=host)) return rdd.mapPartitions(mapPartitions).collect()