def run_executable(args, live=None, socketid=None):
    logger = Logger()
    logger.open('P')
    logger.write('P', 'Starting Execution')
    logger.close('P')

    conn = sqlite3.connect('/var/www/html/cloudcv/db/message.db')
    c = conn.cursor()
    try:
        # stderr is merged into stdout, so a single pipe carries both streams.
        popen = subprocess.Popen(args, bufsize=1, stdin=open(os.devnull),
                                 stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        count = 0
        complete_output = ''
        # Tell the client which pid is serving this job.
        r.publish('chat', json.dumps({'error': str(popen.pid), 'socketid': socketid}))

        while True:
            line = popen.stdout.readline()
            if line:
                if live is True:
                    r.publish('chat', json.dumps({'error': str(line), 'socketid': socketid}))
                complete_output += str(line)
                count += 1
            if popen.poll() is not None:
                break
            # Still running: push a liveness ping to the client.
            r.publish('chat', json.dumps({'error': str(popen.poll()), 'socketid': socketid}))
        return complete_output
    finally:
        conn.close()
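# The core pattern above -- streaming a child process's merged stdout/stderr
# line by line -- can be exercised on its own. A minimal, self-contained sketch
# with the Redis, Logger, and sqlite dependencies stripped out; the
# ['python', '-c', ...] command is only an illustration.
import os
import subprocess

def stream_output(cmd):
    """Run `cmd` with stderr merged into stdout and yield lines as they appear."""
    proc = subprocess.Popen(cmd, bufsize=1, stdin=open(os.devnull),
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)
    for out_line in iter(proc.stdout.readline, ''):
        yield out_line
    proc.wait()

if __name__ == '__main__':
    for out_line in stream_output(['python', '-c', 'print("hello"); print("world")']):
        print(out_line, end='')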
def main():
    hud = Hud()
    # ser = serial.Serial(port='/dev/ttyUSB0', baudrate=9600,
    #                     parity=serial.PARITY_NONE, stopbits=serial.STOPBITS_ONE,
    #                     bytesize=serial.EIGHTBITS, timeout=1)
    logger = None
    kv = {}
    stringKV = {}
    firstPass = True
    while True:
        # Wrapping the loop body in try/except handles the file open/close
        # lifecycle: a TclError means the HUD window was closed, so close the
        # log file and stop. Var parsing and file writing could later be
        # wrapped in classes.
        sleep(0.5)
        try:
            # inLine = ser.readline()
            inLine = RT.randomKVLine()
            if inLine[0] == '@':
                newKV = parseLineToKV(inLine)
                newStringKV = parseLineToStringKV(inLine)
                if firstPass:
                    keys = sorted(newStringKV.keys())
                    logger = Logger(keys)
                    firstPass = False
                if newKV != kv:
                    kv = newKV
                    stringKV = newStringKV
                    hud.updateHud(kv)
                    hud.update_idletasks()
                    hud.update()
                    logger.logKV(stringKV)
                    print strfkv(stringKV)
        except TclError:
            print "HUD was closed"
            logger.close()
            break
        except Exception as err:
            print err
            break
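# parseLineToKV is not shown above; purely as an illustration, here is one way
# such a parser could look if the serial lines were formatted like
# '@rpm=3000,temp=94.5'. The '@key=value,...' format is an assumption, not
# something taken from the source.
def parse_line_to_kv_example(line):
    pairs = line.lstrip('@').strip().split(',')
    kv = {}
    for pair in pairs:
        key, _, value = pair.partition('=')
        try:
            kv[key] = float(value)   # numeric channels
        except ValueError:
            kv[key] = value          # fall back to the raw string
    return kv

# parse_line_to_kv_example('@rpm=3000,temp=94.5') -> {'rpm': 3000.0, 'temp': 94.5}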
        sendsMessageToRedis(userid, jobid, source_type, socketid, output,
                            result_path, result_url, dropbox_token=db_token)
        r.publish('chat', json.dumps({'message': 'Bounding Boxes Generated',
                                      'socketid': str(socketid),
                                      'token': token, 'jobid': jobid}))
    elif parsed_dict['exec'] == 'classify':
        run_classification(parsed_dict['userid'], parsed_dict['jobid'],
                           parsed_dict['image_path'], parsed_dict['socketid'],
                           parsed_dict['token'], parsed_dict['source_type'],
                           result_path, db_token=db_token)
    elif parsed_dict['exec'] == 'features':
        output = ''
        # `list` (built upstream) holds the base executable command; the name
        # shadows the builtin.
        if list[-2] != '':
            output += run_executable(list) + '\n'
        list_decaf = ['python',
                      '/var/www/html/cloudcv/fileupload/executable/decaf_cal_feature.py',
                      image_path, result_path, str(flag), parsed_dict['socketid']]
        output += run_executable(list_decaf, live=False,
                                 socketid=parsed_dict['socketid'])
        sendsMessageToRedis(userid, jobid, source_type, socketid, output,
                            result_path, result_url, dropbox_token=db_token)
        r.publish('chat', json.dumps({'message': 'Features Generated',
                                      'socketid': str(socketid),
                                      'token': token, 'jobid': jobid}))
except Exception as e:
    print e
    log(e, '__main__')
    logger = Logger()
    logger.open('E')
    logger.write('E', str(traceback.format_exc()))
    logger.close('E')
    r.publish('chat', json.dumps({'error': str(traceback.format_exc()),
                                  'socketid': parsed_dict['socketid']}))
    logger.close('P')
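# For reference, the dispatcher above pulls these fields out of parsed_dict.
# The field names come from the lookups in the code; the values below are
# made up purely for illustration.
import json

example_message = json.dumps({
    'exec': 'classify',            # or 'features'; other commands handled above
    'userid': '42',
    'jobid': 'job-0001',
    'image_path': '/tmp/input.jpg',
    'socketid': 'example-socket-id',
    'token': 'example-token',
    'source_type': 'upload',
})
parsed_dict_example = json.loads(example_message)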
class ParallelSGD:
    """
    P-SGD driver class (P-SGD RPC Controller).
    """

    def __init__(self, model: Model, data: AbsDataset, transform: ITransformer):
        """
        Initialize a P-SGD controller object.
        :param model: model to be data-parallelized.
        :param data: dataset to parallelize over.
        :param transform: dataset transformation strategy. The Dataset is visible
                       on every node; `transform` performs the stateful conversion,
                       and all dataset processing runs locally. Transformation
                       strategies are organized as a linked list and run as a
                       pipeline: the dataset passes through each transform in turn
                       and is finally scheduled by BatchIter.
        """
        self.__model = model
        self.__data = data
        self.__transform = transform
        self.__log = Logger(title_info="P-SGD Submit", log_to_file=True)

    def parallel(self,
                 nodes: NodeAssignment,
                 redundancy: int = 1,
                 block_size: int = 64,
                 epoch: int = 10,
                 assignment_type: Type[AbsBlockAssignment] = IIDBlockAssignment,
                 sync_type: Type[ISyncType] = SynchronizedSGD,
                 op_type: Type[IOptimizer] = PSGDOptimizer,
                 gd_type: Type[IGradientDescent] = ADAMOptimizer,
                 codec: Union[Dict[int, Type[Codec]], Type[Codec]] = None,
                 gd_params: Tuple[object] = (),
                 ps_codec: Union[Dict[int, Type[Codec]], Type[Codec], None] = None,
                 network_bandwidth: int = 1048576,
                 mission_title: str = "P-SGD",
                 ssgd_timeout_limit: int = 10000,
                 codec_extra_parameters: Dict[Hashable, SupportsFloat] = None
                 ) -> Dict[str, SupportsFloat]:
        """
        Run the job in parallel.
        :param ssgd_timeout_limit: Sync-SGD wait timeout in milliseconds (integer).
        :param network_bandwidth: available network bandwidth, used to estimate
                       transfer time and set the pre_commit timeout timer.
        :param mission_title: job title, used as the log file name for this run.
        :param nodes: NodeAssignment interface provided by the network module,
                       listing the nodes this parallel run uses. The parameter
                       server's node id is utils.constant.Parameter_Server;
                       worker ids are integers starting from 0.
        :param redundancy: redundancy setting, for codecs and block assignments
                       that can handle it. Subclasses of AbsBlockAssignment can
                       always handle submissions with redundancy; how a codec
                       handles redundancy is up to the codec itself.
        :param block_size: per-node Batch size. The concrete update strategy is
                       controlled by the codec, so block size and batch size have
                       no fixed correspondence. If the codec emits a result after
                       every Block, Block size and Batch size are effectively
                       equal; if the codec always waits for all Blocks to finish
                       training before synchronizing, the effective Batch size
                       equals Block size times the number of Blocks.
        :param epoch: number of training epochs. The codec and sync type jointly
                       decide the within-epoch synchronization strategy; when a
                       parameter server is used, it also takes part in
                       maintaining synchronization state. If the codec disallows
                       asynchronous execution, all nodes finish an epoch at the
                       same moment; if the codec or sync type allows cross-batch
                       execution, nodes finish according to their own compute
                       capacity.
        :param assignment_type: sample assignment strategy, usually combined with
                       redundancy; a type implementing profiles.ISetting.
                       Originally meant for redundant dataset assignment, it can
                       now also provide static data-volume assignment.
        :param sync_type: synchronization mode, synchronous or asynchronous; a
                       type implementing parallel_sgd.sync.IParallelSGD. In
                       synchronous mode, each worker processes received data only
                       when get_weights() is called; in asynchronous mode, each
                       worker processes data and updates the result set as soon
                       as it arrives. The concrete data flow and result-update
                       strategy are defined by the codec.
        :param op_type: optimizer strategy; a type implementing nn.IOptimizer.
                       Responsible for applying gradient updates.
        :param gd_type: gradient descent strategy; a type implementing
                       nn.gradient_descent.IGradientDescent. Responsible for
                       producing the update increments to be applied.
        :param codec: codec type; implements the codec.interface.Codec interface.
        :param gd_params: parameters for the gradient generator.
        :param ps_codec: codec type; implements the codec.interface.Codec
                       interface. Used by the parameter server for its data
                       processing.
        :param codec_extra_parameters: extra parameters recognized by the Codec
                       interface, as a dict. The dict is stored in each worker's
                       codec.GlobalSettings.__global_parameters; a Codec object
                       can read a value via
                       GlobalSettings.get_params(key: str) -> object.
        :return: Dict with the global results: average accuracy and loss, plus
                       any other metrics defined in the Model.
        """
        # Fill in defaults for the codec configuration.
        if codec_extra_parameters is None:
            codec_extra_parameters = {}
        if codec is None:
            codec = dict()
        if ps_codec is None:
            ps_codec = dict()

        # Default codec fallbacks.
        default_codec = DummyCodec
        default_ps_codec = DummyCodec

        # If a single concrete Codec class was passed, use it as the default.
        if isinstance(codec, type):
            default_codec = codec
            codec = dict()
        if isinstance(ps_codec, type):
            default_ps_codec = ps_codec
            ps_codec = dict()

        # Count the valid workers and detect a parameter server.
        node_count = 0
        has_ps = False
        for _id, _ in nodes:
            if _id >= 0:
                node_count += 1
            else:
                has_ps = True

        # Task assignment strategy.
        assignment: ISetting = assignment_type(node_count, redundancy)
        # Assignment strategy instance.
        setting: net_setting = net_setting(assignment_type, node_count, redundancy)
        # Model instance.
        model: net_model = net_model(self.__model,
                                     BatchIter(block_size, assignment.block_count))
        # Optimizer instance.
        optimizer: net_optimizer = net_optimizer(op_type, gd_type, op_params=gd_params)
        # Trainable variable ids.
        var_ids = [var.id for var in self.__model.trainable_variables()]
        # Per-variable codec table.
        var_codec = {var_id: (sync_type, codec.get(var_id, default_codec))
                     for var_id in var_ids}
        # Transfer instance for workers.
        transfer_worker: net_transfer = net_transfer(var_codec)
        # Per-variable codec table for the parameter server.
        var_ps_codec = {var_id: (AsynchronizedSGD, ps_codec.get(var_id, default_ps_codec))
                        for var_id in var_ids}
        # Transfer instance for the parameter server (None when no PS is present).
        transfer_ps: Union[net_transfer, None] = net_transfer(var_ps_codec) if has_ps else None
        # Miscellaneous job info.
        misc: misc_package = misc_package(mission_title, epoch, None, ssgd_timeout_limit)

        replies = {
            Req.Model: model,
            Req.Setting: setting,
            Req.Optimizer: optimizer,
            Req.Transfer: transfer_worker,
            Req.Transfer_PS: transfer_ps,
            Req.Other_Stuff: misc,
            Req.Extra_Content: extra_package(codec_extra_parameters),
            Req.Data_Package: data_package(self.__data, self.__transform),
            Req.Data_Content: data_content(self.__data, self.__transform)
        }

        req = Request()
        self.__log.log_message("Start job.")
        self.__log.log_message("Workers: {}".format(nodes))

        with req.request(nodes) as com:
            coordinator = Coordinator(com, estimate_bandwidth=network_bandwidth,
                                      logger=self.__log)
            if has_ps:
                coordinator.submit_single(PSGDPSExecutor, Parameter_Server,
                                          self.__data.estimate_size())
            coordinator.submit_group(PSGDWorkerExecutor, assignment.nodes,
                                     self.__data.estimate_size())
            coordinator.resources_dispatch(lambda _id, x: replies[x])
            res, err = coordinator.join()
        self.__log.close()

        # Accumulate the return values from every node.
        ret: Dict[str, float] = {}
        for node in res:
            if isinstance(res, dict) and isinstance(res[node], dict):
                for key in res[node]:
                    ret[key] = ret.get(key, 0) + res[node][key]
        # Average across nodes.
        for key in ret:
            ret[key] = ret[key] / len(res)
        return ret
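# A submission sketch. It assumes NodeAssignment exposes an add(id, address)
# method as in the framework's examples; the IP addresses and the choice of
# DummyCodec are placeholders, and `model`, `data`, and `transform` must be
# built with the framework's own classes.
nodes = NodeAssignment()
nodes.add(0, '192.168.1.10')                  # worker ids start at 0
nodes.add(1, '192.168.1.11')
nodes.add(Parameter_Server, '192.168.1.12')   # optional parameter server

job = ParallelSGD(model, data, transform)
metrics = job.parallel(nodes, block_size=64, epoch=10,
                       codec=DummyCodec, ps_codec=DummyCodec,
                       mission_title="demo-run")
print(metrics)   # e.g. averaged accuracy/loss across workers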
            opt.zero_grad()
            image = image.to(device=device, dtype=torch.float32)
            label = label.to(device=device, dtype=torch.float32)
            pred = net(image)
            loss = loss_fun(pred, label)
            loss.backward()
            i = i + 1
            running_loss = running_loss + loss.item()
            opt.step()
        end = time.time()  # wall-clock time; time.clock() no longer exists in Python 3.8+
        loss_avg_epoch = running_loss / i
        Unet_train_txt.write(str(format(loss_avg_epoch, '.4f')) + '\n')
        print('epoch: %d avg loss: %f time: %d s' % (epoch, loss_avg_epoch, end - begin))
        # Checkpoint whenever the epoch's average loss improves on the best so far.
        if loss_avg_epoch < bes_los:
            bes_los = loss_avg_epoch
            state = {'net': net.state_dict(), 'opt': opt.state_dict(), 'epoch': epoch}
            torch.save(state, 'model_pth')


if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = UNet(1, 1, bilinear=False)
    net.to(device=device)
    # Dataset path; backslashes are doubled consistently so none are taken as escapes.
    data_path = "D:\\BaiduNetdiskDownload\\data\\data\\train\\image\\"
    Train_Unet(net, device, data_path, epochs=40, batch_size=1)
    Unet_train_txt.close()
    plot_picture('Unet_train.txt')
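# To resume from the checkpoint written above -- a minimal sketch; `net` and
# `device` come from this script, while `opt` stands for a newly constructed
# optimizer of the same class used inside Train_Unet.
checkpoint = torch.load('model_pth', map_location=device)
net.load_state_dict(checkpoint['net'])
opt.load_state_dict(checkpoint['opt'])
start_epoch = checkpoint['epoch'] + 1   # continue after the saved epoch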