def test_gpu_memory_profiler_gluon():
    enable_profiler(profile_filename='test_profiler.json', run=True,
                    continuous_dump=True)
    profiler.set_state('run')

    model = nn.HybridSequential()
    model.add(nn.Dense(128, activation='tanh'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(64, activation='tanh'),
              nn.Dense(32, in_units=64))
    model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    with open('gpu_memory_profile-pid_%d.csv' % os.getpid(), mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            print(",".join(list(row.values())))
        for scope in ['in_arg', 'arg_grad']:
            for key, nd in model.collect_params().items():
                expected_arg_name = "%s:%s:" % (model.name, scope) + nd.name
                expected_arg_size = str(4 * np.prod(nd.shape))
                csv_file.seek(0)
                entry_found = False
                for row in csv_reader:
                    if row['Attribute Name'] == expected_arg_name:
                        assert row['Requested Size'] == expected_arg_size, \
                            "requested size={} is not equal to the expected size={}" \
                            .format(row['Requested Size'], expected_arg_size)
                        entry_found = True
                        break
                assert entry_found, \
                    "Entry for attr_name={} has not been found" \
                    .format(expected_arg_name)
        # Make sure that there is no unknown allocation entry.
        csv_file.seek(0)
        for row in csv_reader:
            if row['Attribute Name'] == "<unk>:unknown" or \
               row['Attribute Name'] == "<unk>:":
                assert False, "Unknown allocation entry has been encountered"
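# The tests in this section call an enable_profiler() helper that is not
# defined here. A minimal sketch of what it might look like, modeled on the
# MXNet unit-test helper (the exact set of set_config flags is an
# assumption). Note that a few older snippets below call a variant that
# takes no filename and writes to a hard-coded 'test_profile.json'.
def enable_profiler(profile_filename, run=True, continuous_dump=False,
                    aggregate_stats=False):
    profiler.set_config(profile_symbolic=True,
                        profile_imperative=True,
                        profile_memory=True,
                        profile_api=True,
                        filename=profile_filename,
                        continuous_dump=continuous_dump,
                        aggregate_stats=aggregate_stats)
    if run is True:
        profiler.set_state('run')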
def test_continuous_profile_and_instant_marker():
    file_name = 'test_continuous_profile_and_instant_marker.json'
    enable_profiler(file_name, True, True, True)
    python_domain = profiler.Domain('PythonDomain::test_continuous_profile')
    last_file_size = 0
    for i in range(5):
        profiler.Marker(python_domain, "StartIteration-" + str(i)).mark('process')
        test_profile_event(False)
        test_profile_counter(False)
        profiler.dump(False)
        # File size should keep increasing
        new_file_size = os.path.getsize(file_name)
        assert new_file_size >= last_file_size
        last_file_size = new_file_size
    profiler.dump(False)
    debug_str = profiler.dumps()
    assert len(debug_str) > 0
    profiler.set_state('stop')
def test_continuous_profile_and_instant_marker():
    # Older variant of the test above: here enable_profiler() takes no
    # filename and writes to the hard-coded 'test_profile.json'.
    enable_profiler(True, True, True)
    python_domain = profiler.Domain('PythonDomain::test_continuous_profile')
    last_file_size = 0
    for i in range(5):
        profiler.Marker(python_domain, "StartIteration-" + str(i)).mark('process')
        print("{}...".format(i))
        test_profile_event(False)
        test_profile_counter(False)
        profiler.dump(False)
        # File size should keep increasing
        new_file_size = os.path.getsize("test_profile.json")
        assert new_file_size >= last_file_size
        last_file_size = new_file_size
    profiler.dump(False)
    debug_str = profiler.dumps()
    assert len(debug_str) > 0
    print(debug_str)
    profiler.set_state('stop')
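# Both variants of test_continuous_profile_and_instant_marker() above also
# rely on test_profile_event() and test_profile_counter() helpers that are
# not shown. A minimal, simplified sketch of what they might do (the timing
# assertions of the real unit tests are omitted here):
def test_profile_event(do_enable_profiler=True):
    if do_enable_profiler:
        enable_profiler('test_profile_event.json')
    event = profiler.Event("test_profile_event")
    event.start()
    time.sleep(1.0)
    event.stop()
    if do_enable_profiler:
        profiler.dump(True)
        profiler.set_state('stop')

def test_profile_counter(do_enable_profiler=True):
    if do_enable_profiler:
        enable_profiler('test_profile_counter.json')
    python_domain = profiler.Domain('PythonDomain::test_profile_counter')
    counter = profiler.Counter(python_domain, "profile_counter", 0)
    for i in range(5):
        counter += 1
    if do_enable_profiler:
        profiler.dump(True)
        profiler.set_state('stop')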
def test_aggregate_duplication():
    file_name = 'test_aggregate_duplication.json'
    enable_profiler(profile_filename=file_name, run=True,
                    continuous_dump=True, aggregate_stats=True)
    inp = mx.nd.zeros(shape=(100, 100))
    y = mx.nd.sqrt(inp)
    inp = inp + 1
    inp = inp + 1
    mx.nd.waitall()
    profiler.dump(False)
    debug_str = profiler.dumps(format='json')
    target_dict = json.loads(debug_str)
    assert 'Time' in target_dict and 'operator' in target_dict['Time'] \
        and 'sqrt' in target_dict['Time']['operator'] \
        and 'Count' in target_dict['Time']['operator']['sqrt'] \
        and '_plus_scalar' in target_dict['Time']['operator'] \
        and 'Count' in target_dict['Time']['operator']['_plus_scalar']
    # they are called once and twice respectively
    assert target_dict['Time']['operator']['sqrt']['Count'] == 1
    assert target_dict['Time']['operator']['_plus_scalar']['Count'] == 2
    profiler.set_state('stop')
def test_profiler():
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4
    enable_profiler('test_profiler.json', False, False)
    A = mx.sym.Variable('A')
    B = mx.sym.Variable('B')
    C = mx.symbol.dot(A, B)
    executor = C.simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096))
    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])
    print("execution begin")
    for i in range(iter_num):
        print("Iteration {}/{}".format(i + 1, iter_num))
        if i == begin_profiling_iter:
            # time.clock() was removed in Python 3.8; use process_time() instead
            t0 = time.process_time()
            profiler.set_state('run')
        if i == end_profiling_iter:
            t1 = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        c = executor.outputs[0]
        c.wait_to_read()
    print("execution end")
    duration = t1 - t0
    print('duration: {0}s'.format(duration))
    print('          {0}ms/operator'.format(duration * 1000 / iter_num))
    profiler.dump(True)
    profiler.set_state('stop')
def test_profiler():
    # Older variant of the test above: this version of enable_profiler()
    # takes no filename and writes to the hard-coded 'test_profile.json'.
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4
    enable_profiler(False, False)
    A = mx.sym.Variable('A')
    B = mx.sym.Variable('B')
    C = mx.symbol.dot(A, B)
    executor = C.simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096))
    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])
    print("execution begin")
    for i in range(iter_num):
        print("Iteration {}/{}".format(i + 1, iter_num))
        if i == begin_profiling_iter:
            t0 = time.process_time()
            profiler.set_state('run')
        if i == end_profiling_iter:
            t1 = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        c = executor.outputs[0]
        c.wait_to_read()
    print("execution end")
    duration = t1 - t0
    print('duration: {0}s'.format(duration))
    print('          {0}ms/operator'.format(duration * 1000 / iter_num))
    profiler.dump(True)
    profiler.set_state('stop')
def test_profiler():
    # Variant using the private _simple_bind API (the MXNet 2.x rename of
    # simple_bind), with the per-iteration prints removed.
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4
    enable_profiler('test_profiler.json', False, False)
    A = mx.sym.Variable('A')
    B = mx.sym.Variable('B')
    C = mx.symbol.dot(A, B)
    executor = C._simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096))
    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])
    for i in range(iter_num):
        if i == begin_profiling_iter:
            t0 = time.process_time()
            profiler.set_state('run')
        if i == end_profiling_iter:
            t1 = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        c = executor.outputs[0]
        c.wait_to_read()
    duration = t1 - t0
    profiler.dump(True)
    profiler.set_state('stop')
def test_profile_create_domain_dept():
    profiler.set_config(profile_symbolic=True,
                        filename='test_profile_create_domain_dept.json')
    profiler.set_state('run')
    domain = profiler.Domain(name='PythonDomain')
    profiler.dump()
    profiler.set_state('stop')
def test_gpu_memory_profiler_symbolic():
    enable_profiler('test_profiler.json')
    profiler.set_state('run')

    with profiler.scope("tensordot"):
        A = mx.sym.Variable('A')
        B = mx.sym.Variable('B')
        C = mx.symbol.dot(A, B, name='dot')

    executor = C._simple_bind(mx.gpu(), 'write', A=(1024, 2048), B=(2048, 4096))

    with profiler.scope("init"):
        a = mx.random.uniform(-1.0, 1.0, shape=(1024, 2048))
        b = mx.random.uniform(-1.0, 1.0, shape=(2048, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    executor.forward()
    executor.backward()
    c = executor.outputs[0]
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    expected_alloc_entries = [
        {'Attribute Name': 'tensordot:in_arg:A',
         'Requested Size': str(4 * a.size)},
        {'Attribute Name': 'tensordot:in_arg:B',
         'Requested Size': str(4 * b.size)},
        {'Attribute Name': 'tensordot:dot',
         'Requested Size': str(4 * c.size)},
        {'Attribute Name': 'init:_random_uniform',
         'Requested Size': str(4 * a.size)},
        {'Attribute Name': 'init:_random_uniform',
         'Requested Size': str(4 * b.size)}]

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # init:_random_uniform,33554432,0,33554432,1
    # init:_random_uniform,8388608,0,8388608,1
    # resource:temp_space (sample_op.h +365),8,0,4096,0
    # symbol:arg_grad:unknown,8388608,0,8388608,0
    # symbol:arg_grad:unknown,33554432,0,33554432,0
    # tensordot:dot,16777216,0,16777216,0
    # tensordot:dot_backward,33554432,0,33554432,0
    # tensordot:dot_backward,8388608,0,8388608,0
    # tensordot:dot_head_grad,16777216,0,16777216,0
    # tensordot:in_arg:A,8388608,0,8388608,0
    # tensordot:in_arg:B,33554432,0,33554432,0
    with open('gpu_memory_profile-pid_%d.csv' % os.getpid(), mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        # TODO: Remove this print statement later on.
        for row in csv_reader:
            print(",".join(list(row.values())))
        for expected_alloc_entry in expected_alloc_entries:
            csv_file.seek(0)
            entry_found = False
            for row in csv_reader:
                if row['Attribute Name'] == expected_alloc_entry['Attribute Name'] and \
                   row['Requested Size'] == expected_alloc_entry['Requested Size']:
                    entry_found = True
                    break
            assert entry_found, \
                "Entry for (attr_name={}, alloc_size={}) has not been found" \
                .format(expected_alloc_entry['Attribute Name'],
                        expected_alloc_entry['Requested Size'])
        # Make sure that there is no unknown allocation entry.
        csv_file.seek(0)
        for row in csv_reader:
            if row['Attribute Name'] == "<unk>:unknown" or \
               row['Attribute Name'] == "<unk>:":
                assert False, "Unknown allocation entry has been encountered"
def test_gpu_memory_profiler_gluon():
    enable_profiler(profile_filename='test_profiler.json')
    profiler.set_state('run')

    model = nn.HybridSequential()
    model.add(nn.Dense(128, activation='tanh'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(64, activation='tanh'),
              nn.Dense(32, in_units=64))
    model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # <unk>:in_arg:data,640,0,4096,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd,2048,0,4096,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd_backward,8192,0,8192,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd_head_grad,2048,0,4096,0
    # hybridsequential:dense0:activation0:hybridsequential_dense0_activation0_fwd,8192,0,8192,0
    # hybridsequential:dense0:arg_grad:bias,512,0,4096,0
    # hybridsequential:dense0:arg_grad:weight,5120,0,8192,0
    # hybridsequential:dense0:hybridsequential_dense0_fwd,8192,0,8192,0
    # hybridsequential:dense0:in_arg:bias,512,0,4096,0
    # hybridsequential:dense0:in_arg:weight,5120,0,8192,0
    # hybridsequential:dense1:activation0:hybridsequential_dense1_activation0_fwd,4096,0,4096,0
    # hybridsequential:dense1:arg_grad:bias,256,0,4096,0
    # hybridsequential:dense1:arg_grad:weight,32768,0,32768,0
    # hybridsequential:dense1:hybridsequential_dense1_fwd,4096,0,4096,0
    # hybridsequential:dense1:in_arg:bias,256,0,4096,0
    # hybridsequential:dense1:in_arg:weight,32768,0,32768,0
    # hybridsequential:dense2:arg_grad:bias,128,0,4096,0
    # hybridsequential:dense2:arg_grad:weight,8192,0,8192,0
    # hybridsequential:dense2:hybridsequential_dense2_fwd_backward,4096,0,4096,1
    # hybridsequential:dense2:in_arg:bias,128,0,4096,0
    # hybridsequential:dense2:in_arg:weight,8192,0,8192,0
    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
    # resource:cudnn_dropout_state (dropout-inl.h +256),1474560,0,1474560,0
    # resource:temp_space (fully_connected-inl.h +316),15360,0,16384,0

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    with open('gpu_memory_profile-pid_%d.csv' % os.getpid(), mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        # TODO: Remove this print statement later on.
        for row in csv_reader:
            print(",".join(list(row.values())))
        for param in model.collect_params().values():
            expected_arg_name = "%sin_arg:" % param.var().attr('__profiler_scope__') + \
                                param.name
            expected_arg_size = str(4 * np.prod(param.shape))
            csv_file.seek(0)
            entry_found = False
            for row in csv_reader:
                if row['Attribute Name'] == expected_arg_name and \
                   row['Requested Size'] == expected_arg_size:
                    entry_found = True
                    break
            assert entry_found, \
                "Entry for (attr_name={}, alloc_size={}) has not been found" \
                .format(expected_arg_name, expected_arg_size)
        # Make sure that there is no unknown allocation entry.
        csv_file.seek(0)
        for row in csv_reader:
            if row['Attribute Name'] == "<unk>:unknown" or \
               row['Attribute Name'] == "<unk>:":
                assert False, "Unknown allocation entry has been encountered"
def train(args, model, train_sampler, valid_samplers=None, rank=0,
          rel_parts=None, barrier=None):
    assert args.num_proc <= 1, "MXNet KGE does not support multi-process now"
    assert args.rel_part == False, \
        "No need for relation partition in single process for MXNet KGE"
    logs = []

    for arg in vars(args):
        logging.info('{:20}:{}'.format(arg, getattr(args, arg)))

    if len(args.gpu) > 0:
        gpu_id = args.gpu[rank % len(args.gpu)] \
            if args.mix_cpu_gpu and args.num_proc > 1 else args.gpu[0]
    else:
        gpu_id = -1

    if args.strict_rel_part:
        model.prepare_relation(mx.gpu(gpu_id))

    if mxprofiler:
        from mxnet import profiler
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output.json')
    start = time.time()
    for step in range(0, args.max_step):
        pos_g, neg_g = next(train_sampler)
        args.step = step
        if step == 1 and mxprofiler:
            profiler.set_state('run')
        with mx.autograd.record():
            loss, log = model.forward(pos_g, neg_g, gpu_id)
        loss.backward()
        logs.append(log)
        model.update(gpu_id)

        if step % args.log_interval == 0:
            for k in logs[0].keys():
                v = sum(l[k] for l in logs) / len(logs)
                print('[Train]({}/{}) average {}: {}'.format(
                    step, args.max_step, k, v))
            logs = []
            print(time.time() - start)
            start = time.time()

        if args.valid and step % args.eval_interval == 0 and step > 1 \
                and valid_samplers is not None:
            start = time.time()
            test(args, model, valid_samplers, mode='Valid')
            print('test:', time.time() - start)
    if args.strict_rel_part:
        model.writeback_relation(rank, rel_parts)
    if mxprofiler:
        nd.waitall()
        profiler.set_state('stop')
        profiler.dump()
        print(profiler.dumps())
    # clear cache
    logs = []
def test_gpu_memory_profiler_gluon():
    enable_profiler(profile_filename='test_profiler.json', run=True,
                    continuous_dump=True)
    profiler.set_state('run')

    model = nn.HybridSequential(prefix='net_')
    with model.name_scope():
        model.add(nn.Dense(128, activation='tanh'))
        model.add(nn.Dropout(0.5))
        model.add(nn.Dense(64, activation='tanh'),
                  nn.Dense(32, in_units=64))
        model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sample gpu_memory_profiler.csv
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # "<unk>:in_arg:data","640","0","4096","0"
    # "net:arg_grad:net_dense0_bias","512","0","4096","0"
    # "net:arg_grad:net_dense0_weight","5120","0","8192","0"
    # "net:arg_grad:net_dense1_bias","256","0","4096","0"
    # "net:arg_grad:net_dense1_weight","32768","0","32768","0"
    # "net:arg_grad:net_dense2_bias","128","0","4096","0"
    # "net:arg_grad:net_dense2_weight","8192","0","8192","0"
    # "net:dense0:net_dense0_fwd","8192","0","8192","0"
    # "net:dense0:tanh:net_dense0_tanh_fwd","8192","0","8192","0"
    # "net:dense1:net_dense1_fwd","4096","0","4096","0"
    # "net:dense1:tanh:net_dense1_tanh_fwd","4096","0","4096","0"
    # "net:dense2:net_dense2_fwd","2048","0","4096","0"
    # "net:dense2:net_dense2_fwd_backward","4096","0","4096","0"
    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
    # "net:in_arg:net_dense0_bias","512","0","4096","0"
    # "net:in_arg:net_dense0_weight","5120","0","8192","0"
    # "net:in_arg:net_dense1_bias","256","0","4096","0"
    # "net:in_arg:net_dense1_weight","32768","0","32768","0"
    # "net:in_arg:net_dense2_bias","128","0","4096","0"
    # "net:in_arg:net_dense2_weight","8192","0","8192","0"
    # "net:relu0:net_relu0_fwd","2048","0","4096","0"
    # "net:relu0:net_relu0_fwd_backward","8192","0","8192","0"
    # "net:relu0:net_relu0_fwd_head_grad","2048","0","4096","0"
    # "resource:cudnn_dropout_state (dropout-inl.h +258)","1671168","0","1671168","0"
    # "resource:temp_space (fully_connected-inl.h +316)","34816","0","36864","0"

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    with open('gpu_memory_profile-pid_%d.csv' % os.getpid(), mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for scope in ['in_arg', 'arg_grad']:
            for key, nd in model.collect_params().items():
                expected_arg_name = "net:%s:" % scope + key
                expected_arg_size = str(4 * np.prod(nd.shape))
                csv_file.seek(0)
                entry_found = False
                for row in csv_reader:
                    if row['Attribute Name'] == expected_arg_name:
                        assert row['Requested Size'] == expected_arg_size, \
                            "requested size={} is not equal to the expected size={}" \
                            .format(row['Requested Size'], expected_arg_size)
                        entry_found = True
                        break
                assert entry_found, \
                    "Entry for attr_name={} has not been found" \
                    .format(expected_arg_name)
        # Make sure that there is no unknown allocation entry.
        csv_file.seek(0)
        for row in csv_reader:
            if row['Attribute Name'] == "<unk>:unknown" or \
               row['Attribute Name'] == "<unk>:":
                assert False, "Unknown allocation entry has been encountered"
def test_gpu_memory_profiler_symbolic():
    iter_num = 5
    enable_profiler('test_profiler.json', False, False)
    profiler.set_state('run')

    with profiler.Scope("tensordot"):
        A = mx.sym.Variable('A')
        B = mx.sym.Variable('B')
        C = mx.symbol.dot(A, B, name='dot')

    executor = C.simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096))

    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    for i in range(iter_num):
        executor.forward()
        c = executor.outputs[0]
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    expected_alloc_entries = [
        {'Attribute Name': 'tensordot:in_arg:A',
         'Requested Size': str(4 * a.size)},
        {'Attribute Name': 'tensordot:in_arg:B',
         'Requested Size': str(4 * b.size)},
        {'Attribute Name': 'tensordot:arg_grad:A',
         'Requested Size': str(4 * a.size)},
        {'Attribute Name': 'tensordot:arg_grad:B',
         'Requested Size': str(4 * b.size)},
        {'Attribute Name': 'tensordot:dot',
         'Requested Size': str(4 * c.size)},
        {'Attribute Name': 'tensordot:dot_head_grad',
         'Requested Size': str(4 * c.size)}]

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # "tensordot:arg_grad:A","67108864","0","67108864","0"
    # "tensordot:arg_grad:B","67108864","0","67108864","0"
    # "tensordot:dot","67108864","0","67108864","0"
    # "tensordot:dot_head_grad","67108864","0","67108864","0"
    # "tensordot:in_arg:A","67108864","0","67108864","0"
    # "tensordot:in_arg:B","67108864","0","67108864","0"
    with open('gpu_memory_profile-pid_%d.csv' % os.getpid(), mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for expected_alloc_entry in expected_alloc_entries:
            csv_file.seek(0)
            entry_found = False
            for row in csv_reader:
                if row['Attribute Name'] == expected_alloc_entry['Attribute Name']:
                    assert row['Requested Size'] == expected_alloc_entry['Requested Size'], \
                        "requested size={} is not equal to the expected size={}" \
                        .format(row['Requested Size'],
                                expected_alloc_entry['Requested Size'])
                    entry_found = True
                    break
            assert entry_found, \
                "Entry for attr_name={} has not been found" \
                .format(expected_alloc_entry['Attribute Name'])
def custom_operator_profiling_multiple_custom_ops(seed, mode, file_name):
    class MyAdd(mx.operator.CustomOp):
        def forward(self, is_train, req, in_data, out_data, aux):
            self.assign(out_data[0], req[0], in_data[0] + 1)

        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
            self.assign(in_grad[0], req[0], out_grad[0])

    @mx.operator.register('MyAdd1')
    class MyAdd1Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd1Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    @mx.operator.register('MyAdd2')
    class MyAdd2Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd2Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    enable_profiler(profile_filename=file_name, run=True,
                    continuous_dump=True, aggregate_stats=True)
    # clear aggregate stats
    profiler.dumps(reset=True)
    inp = mx.nd.zeros(shape=(100, 100))
    if mode == 'imperative':
        y = mx.nd.Custom(inp, op_type='MyAdd1')
        z = mx.nd.Custom(inp, op_type='MyAdd2')
    elif mode == 'symbolic':
        a = mx.symbol.Variable('a')
        b = mx.symbol.Custom(data=a, op_type='MyAdd1')
        c = mx.symbol.Custom(data=a, op_type='MyAdd2')
        y = b.bind(mx.cpu(), {'a': inp})
        z = c.bind(mx.cpu(), {'a': inp})
        yy = y.forward()
        zz = z.forward()
    mx.nd.waitall()
    profiler.dump(False)
    debug_str = profiler.dumps(format='json')
    check_custom_operator_profiling_multiple_custom_ops_output(debug_str)
    profiler.set_state('stop')
# Fragment: this snippet begins mid-script, inside a loop that sums up the
# byte size of the model's auxiliary parameters (float32, hence * 4).
for key in aux_params:  # loop header reconstructed; snippet starts mid-script
    # print(key, aux_params[key])
    param_size += aux_params[key].size * 4
print("Parameter size", param_size / 1024 / 1024, " MB")

repeat_times = 10
profiler.set_state('run')
# profiler.pause()

# train 5 epochs, i.e. going over the data iter one pass
start = time.time()
for epoch in range(5):
    train_data.reset()
    metric.reset()
    for i, batch in enumerate(train_data):
        if i == 1:
            profiler.resume()
        mod.forward(batch, is_train=True)       # compute predictions
        mod.update_metric(metric, batch.label)  # accumulate prediction accuracy
        mod.backward()                          # compute gradients
        mod.update()                            # update parameters
        if i == repeat_times:  # benchmark only the first repeat_times iterations
            break
    # print('Epoch %d, Training %s' % (epoch, metric.get()))
mx.nd.waitall()
profiler.set_state('stop')
profiler.dump()
end = time.time()
time_per_img = (end - start) * 1.0 / batch_size / repeat_times
print("batch\tthreshold\tthread number\ttime per image\tmemory (GB)")
print("%d\t%d\t%s\t%s\t%f" % (batch_size, threshold,
                              os.environ["MXNET_CPU_WORKER_NTHREADS"],
                              time_per_img, cpuStats()))
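# The benchmark above prints memory usage through a cpuStats() helper that
# is not defined in this snippet. A minimal sketch, assuming psutil is
# available and that the column is meant to report resident-set size in GB
# (hypothetical reconstruction of the helper):
import psutil

def cpuStats():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 ** 3  # resident set size in GB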
def main():
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    opt = parse_args()

    batch_size = opt.batch_size
    classes = 10

    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    lr_sch = lr_scheduler.CosineScheduler((50000 // batch_size) * opt.num_epochs,
                                          base_lr=opt.lr,
                                          warmup_steps=5 * (50000 // batch_size),
                                          final_lr=1e-5)
    # lr_sch = lr_scheduler.FactorScheduler((50000 // batch_size) * 20,
    #                                       factor=0.2, base_lr=opt.lr,
    #                                       warmup_steps=5 * (50000 // batch_size))
    # lr_sch = LRScheduler('cosine', opt.lr,
    #                      niters=(50000 // batch_size) * opt.num_epochs)

    model_name = opt.model
    net = SKT_Lite()
    # if model_name.startswith('cifar_wideresnet'):
    #     kwargs = {'classes': classes, 'drop_rate': opt.drop_rate}
    # else:
    #     kwargs = {'classes': classes}
    # net = get_model(model_name, **kwargs)
    if opt.mixup:
        model_name += '_mixup'
    if opt.amp:
        model_name += '_amp'

    makedirs('./' + model_name)
    os.chdir('./' + model_name)
    sw = SummaryWriter(logdir='.\\tb\\' + model_name, flush_secs=5, verbose=False)
    makedirs(opt.save_plot_dir)

    if opt.resume_from:
        net.load_parameters(opt.resume_from, ctx=context)

    optimizer = 'nag'

    save_period = opt.save_period
    if opt.save_dir and save_period:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_period = 0

    plot_name = opt.save_plot_dir

    logging_handlers = [logging.StreamHandler()]
    if opt.logging_dir:
        logging_dir = opt.logging_dir
        makedirs(logging_dir)
        logging_handlers.append(logging.FileHandler(
            '%s/train_cifar10_%s.log' % (logging_dir, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(opt)

    if opt.amp:
        amp.init()

    if opt.profile_mode:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='%s_profile.json' % model_name)

    transform_train = transforms.Compose([
        gcv_transforms.RandomCrop(32, pad=4),
        CutOut(8),
        # gcv_transforms.block.RandomErasing(s_max=0.25),
        transforms.RandomFlipLeftRight(),
        # transforms.RandomFlipTopBottom(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    transform_test = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(root=root, train=True)
            .transform_first(transform_train),
            batch_size=batch_size, shuffle=True,
            last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(root=root, train=False)
            .transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum,
                                 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)

        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if opt.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0
        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for i, batch in enumerate(train_data):
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    profiler.set_state('run')
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                                    batch_axis=0)
                label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                     batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    data = [lam * X + (1 - lam) * X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam * y1 + (1 - lam) * y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if epoch == 0 and iteration == 1 and opt.profile_mode:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)

            if opt.mixup:
                train_history.update([acc, 1 - val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            else:
                train_history.update([1 - train_acc, 1 - val_acc])
                plt.cla()
                train_history.plot(save_path='%s/%s_history.png' %
                                   (plot_name, model_name))
            # acc_history.update([train_acc, val_acc])
            # plt.cla()
            # acc_history.plot(save_path='%s/%s_acc.png' %
            #                  (plot_name, model_name), legend_loc='best')

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            name, val_acc, val_loss = test(ctx, val_data)

            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name),
                              y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc,
                          val_loss, current_lr, time.time() - tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc,
                                         'test_acc': val_acc},
                            global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss,
                                         'test_loss': val_loss},
                            global_step=epoch)

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))

        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs - 1))

    if opt.mode == 'hybrid':
        net.hybridize()
    train(opt.num_epochs, context)
    if opt.profile_mode:
        profiler.dump(finished=False)
    sw.close()
def train(epochs, ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if config.train_cfg.param_init:
        init_func = getattr(mx.init, config.train_cfg.init)
        net.initialize(init_func(), ctx=ctx, force_reinit=True)
    else:
        net.load_parameters(config.train_cfg.param_file, ctx=ctx)

    summary(net, stat_name, nd.uniform(shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
    # net = nn.HybridBlock()
    net.hybridize()

    root = config.dir_cfg.dataset
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(root=root, train=True)
        .transform_first(transform_train),
        batch_size=batch_size, shuffle=True,
        last_batch='discard', num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(root=root, train=False)
        .transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    trainer_arg = {'learning_rate': config.lr_cfg.lr,
                   'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
    extra_arg = eval(config.lr_cfg.extra_arg)
    trainer_arg.update(extra_arg)
    trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
    if config.train_cfg.amp:
        amp.init_trainer(trainer)
    metric = mx.metric.Accuracy()
    train_metric = mx.metric.RMSE()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=False if config.data_cfg.mixup else True)
    train_history = TrainingHistory(['training-error', 'validation-error'])
    # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
    loss_history = TrainingHistory(['training-loss', 'validation-loss'])

    iteration = 0
    best_val_score = 0
    # print('start training')
    sig_state.emit(1)
    sig_pgbar.emit(0)
    # signal.emit('Training')
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        metric.reset()
        train_loss = 0
        num_batch = len(train_data)
        alpha = 1

        for i, batch in enumerate(train_data):
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                profiler.set_state('run')
                is_profiler_run = True
            if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                sw.add_graph(net)
            lam = np.random.beta(alpha, alpha)
            if epoch >= epochs - 20 or not config.data_cfg.mixup:
                lam = 1

            data_1 = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                                batch_axis=0)
            label_1 = gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                 batch_axis=0)

            if not config.data_cfg.mixup:
                data = data_1
                label = label_1
            else:
                data = [lam * X + (1 - lam) * X[::-1] for X in data_1]
                label = []
                for Y in label_1:
                    y1 = label_transform(Y, classes)
                    y2 = label_transform(Y[::-1], classes)
                    label.append(lam * y1 + (1 - lam) * y2)

            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

            if config.train_cfg.amp:
                with ag.record():
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                        # scaled_loss.backward()
            else:
                for l in loss:
                    l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])

            output_softmax = [nd.SoftmaxActivation(out) for out in output]
            train_metric.update(label, output_softmax)
            metric.update(label_1, output_softmax)
            name, acc = train_metric.get()
            if config.save_cfg.tensorboard:
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
            if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                nd.waitall()
                profiler.set_state('stop')
                profiler.dump()
            iteration += 1
            sig_pgbar.emit(iteration)
            if check_flag()[0]:
                sig_state.emit(2)
                while check_flag()[0] or check_flag()[1]:
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

        epoch_time = time.time() - tic
        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        _, train_acc = metric.get()
        name, val_acc, _ = test(ctx, val_data)
        # if config.data_cfg.mixup:
        #     train_history.update([acc, 1 - val_acc])
        #     plt.cla()
        #     train_history.plot(save_path='%s/%s_history.png' %
        #                        (plot_name, model_name))
        # else:
        train_history.update([1 - train_acc, 1 - val_acc])
        plt.cla()
        train_history.plot(save_path='%s/%s_history.png' %
                           (plot_name, model_name))
        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        current_lr = trainer.learning_rate
        name, val_acc, val_loss = test(ctx, val_data)
        logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n val_acc=%f val_loss=%f lr=%f time: %f' %
                     (epoch, train_loss, train_acc, acc, val_acc,
                      val_loss, current_lr, epoch_time))
        loss_history.update([train_loss, val_loss])
        plt.cla()
        loss_history.plot(save_path='%s/%s_loss.png' %
                          (plot_name, model_name),
                          y_lim=(0, 2), legend_loc='best')
        if config.save_cfg.tensorboard:
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc,
                                         'test_acc': val_acc},
                            global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss,
                                         'test_loss': val_loss},
                            global_step=epoch)
        sig_table.emit([epoch, train_loss, train_acc,
                        val_loss, val_acc, current_lr, epoch_time])
        csv_writer.writerow([epoch, train_loss, train_acc,
                             val_loss, val_acc, current_lr, epoch_time])
        csv_file.flush()

        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))
    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs - 1))
def test_custom_operator_profiling_multiple_custom_ops_imperative(
        seed=None, mode='imperative', file_name=None):
    class MyAdd(mx.operator.CustomOp):
        def forward(self, is_train, req, in_data, out_data, aux):
            self.assign(out_data[0], req[0], in_data[0] + 1)

        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
            self.assign(in_grad[0], req[0], out_grad[0])

    @mx.operator.register('MyAdd1')
    class MyAdd1Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd1Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    @mx.operator.register('MyAdd2')
    class MyAdd2Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd2Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    if file_name is None:
        file_name = 'test_custom_operator_profiling_multiple_custom_ops_imperative.json'
    enable_profiler(profile_filename=file_name, run=True,
                    continuous_dump=True, aggregate_stats=True)
    inp = mx.nd.zeros(shape=(100, 100))
    if mode == 'imperative':
        x = inp + 1
        y = mx.nd.Custom(inp, op_type='MyAdd1')
        z = mx.nd.Custom(inp, op_type='MyAdd2')
    elif mode == 'symbolic':
        a = mx.symbol.Variable('a')
        b = a + 1
        c = mx.symbol.Custom(data=a, op_type='MyAdd1')
        d = mx.symbol.Custom(data=a, op_type='MyAdd2')
        b.bind(mx.cpu(), {'a': inp}).forward()
        c.bind(mx.cpu(), {'a': inp}).forward()
        d.bind(mx.cpu(), {'a': inp}).forward()
    mx.nd.waitall()
    profiler.dump(False)
    debug_str = profiler.dumps(format='json')
    target_dict = json.loads(debug_str)
    '''
    We are calling _plus_scalar within MyAdd1 and MyAdd2 and outside both
    the custom operators, so in aggregate stats we should have three
    different kinds of _plus_scalar under domains "Custom Operator" and
    "operator".
    '''
    assert 'Time' in target_dict and 'Custom Operator' in target_dict['Time'] \
        and 'MyAdd1::pure_python' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd2::pure_python' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd1::_plus_scalar' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd2::_plus_scalar' in target_dict['Time']['Custom Operator'] \
        and '_plus_scalar' not in target_dict['Time']['Custom Operator'] \
        and 'operator' in target_dict['Time'] \
        and '_plus_scalar' in target_dict['Time']['operator']
    profiler.set_state('stop')
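# The custom_operator_profiling_multiple_custom_ops() helper earlier in this
# section delegates its assertions to
# check_custom_operator_profiling_multiple_custom_ops_output(), which is not
# shown. A minimal sketch based on the inline checks in the test directly
# above (an assumption, not the verbatim helper). Only the "Custom Operator"
# domain is checked, since the earlier helper does not run _plus_scalar
# outside the custom operators:
def check_custom_operator_profiling_multiple_custom_ops_output(debug_str):
    target_dict = json.loads(debug_str)
    assert 'Time' in target_dict \
        and 'Custom Operator' in target_dict['Time'] \
        and 'MyAdd1::pure_python' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd2::pure_python' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd1::_plus_scalar' in target_dict['Time']['Custom Operator'] \
        and 'MyAdd2::_plus_scalar' in target_dict['Time']['Custom Operator'] \
        and '_plus_scalar' not in target_dict['Time']['Custom Operator']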
def main():
    data_p = Path('/storage/data/').resolve()
    checkpoint_p = Path('./checkpoints/').resolve()
    checkpoint_p.mkdir(parents=True, exist_ok=True)
    logs_p = Path('./logs/').resolve()
    shutil.rmtree(logs_p, ignore_errors=True)

    encoder = SevenPlaneEncoder((19, 19))
    builder = SGFDatasetBuilder(data_p, encoder=encoder)
    builder.download_and_prepare()
    train_itr = builder.train_dataset(batch_size=BATCH_SIZE,
                                      max_worker=cpu_count(),
                                      factor=FACTOR)
    test_itr = builder.test_dataset(batch_size=BATCH_SIZE,
                                    max_worker=cpu_count(),
                                    factor=FACTOR)

    # build model
    betago = Model()
    # convert to half-precision floating point FP16
    # NOTE: all NVIDIA GPUs with compute capability 6.1 have a low-rate FP16
    # performance, i.e. FP16 is not the fast path on these GPUs;
    # data passed to split_and_load() must be float16 too
    # betago.cast('float16')
    # hybridize for speed
    betago.hybridize(static_alloc=True, static_shape=True)

    # print graph
    shape = (1,) + encoder.shape()
    mx.viz.print_summary(betago(mx.sym.var('data')), shape={'data': shape})

    # pin GPUs
    ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # optimizer
    opt_params = {'learning_rate': 0.001, 'beta1': 0.9,
                  'beta2': 0.999, 'epsilon': 1e-08}
    opt = mx.optimizer.create('adam', **opt_params)

    # initialize parameters
    # MXNet initializes the weight matrices uniformly by drawing from
    # [-0.07, 0.07]; bias parameters are all set to 0
    # 'Xavier': initializer designed to keep the scale of gradients roughly
    # the same in all layers
    betago.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx, force_reinit=True)

    # fetch and broadcast parameters
    params = betago.collect_params()

    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')

    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()

    # use accuracy as the evaluation metric
    metric = Accuracy()

    with mxb.SummaryWriter(logdir='./logs') as sw:
        # add graph to MXBoard
        # betago.forward(mx.nd.ones(shape, ctx=ctx[0]))
        # betago.forward(mx.nd.ones(shape, ctx=ctx[1]))
        # sw.add_graph(betago)

        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output.json')

        start = time.perf_counter()
        # train
        for e in range(EPOCHS):
            if 0 == e:
                profiler.set_state('run')
            tick = time.time()
            # reset the train data iterator.
            train_itr.reset()
            # loop over the train data iterator
            for i, batch in enumerate(train_itr):
                if 0 == i:
                    tick_0 = time.time()
                # splits train data into multiple slices along batch_axis
                # copy each slice into a context
                data = split_and_load(batch.data[0], ctx_list=ctx,
                                      batch_axis=0, even_split=False)
                # splits train label into multiple slices along batch_axis
                # copy each slice into a context
                label = split_and_load(batch.label[0], ctx_list=ctx,
                                       batch_axis=0, even_split=False)
                outputs = []
                losses = []
                # inside training scope
                with ag.record():
                    for x, y in zip(data, label):
                        z = betago(x)
                        # computes softmax cross entropy loss
                        l = loss_fn(z, y)
                        outputs.append(z)
                        losses.append(l)
                # backpropagate the error for one iteration
                for l in losses:
                    l.backward()
                # make one step of parameter update;
                # trainer needs to know the batch size of data
                # to normalize the gradient by 1/batch_size
                trainer.step(BATCH_SIZE)
                # updates internal evaluation
                metric.update(label, outputs)
                # Print batch metrics
                if 0 == i % PRINT_N and 0 < i:
                    # checkpointing
                    betago.save_parameters(
                        str(checkpoint_p.joinpath('betago-{}.params'.format(e))))
                    sw.add_scalar(tag='Accuracy',
                                  value={'naive': metric.get()[1]},
                                  global_step=i - PRINT_N)
                    sw.add_scalar(tag='Speed',
                                  value={'naive': BATCH_SIZE * PRINT_N /
                                         (time.time() - tick)},
                                  global_step=i - PRINT_N)
                    print('epoch[{}] batch [{}], accuracy {:.4f}, samples/sec: {:.4f}'
                          .format(e, i, metric.get()[1],
                                  BATCH_SIZE * PRINT_N / (time.time() - tick)))
                    tick = time.time()
            if 0 == e:
                profiler.set_state('stop')
                profiler.dump()
            # gets the evaluation result
            print('epoch [{}], accuracy {:.4f}, samples/sec: {:.4f}'.format(
                e, metric.get()[1],
                BATCH_SIZE * (i + 1) / (time.time() - tick_0)))
            # reset evaluation result to initial state
            metric.reset()

        elapsed = time.perf_counter() - start
        print('elapsed: {:0.3f}'.format(elapsed))

        # use Accuracy as the evaluation metric
        metric = Accuracy()
        for batch in test_itr:
            data = split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            outputs = []
            for x in data:
                outputs.append(betago(x))
            metric.update(label, outputs)
        print('validation %s=%f' % metric.get())
def _save_profile(self):
    if self._profile:
        print(profiler.dumps())
        profiler.dump()
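# _save_profile() above is a method on a larger runner/benchmark class.
# A minimal sketch of the enclosing pattern it implies (the class name,
# constructor, and run() wrapper are hypothetical):
class ProfiledRunner:
    def __init__(self, profile=False, filename='profile_output.json'):
        self._profile = profile
        if self._profile:
            profiler.set_config(profile_all=True, aggregate_stats=True,
                                continuous_dump=True, filename=filename)
            profiler.set_state('run')

    def run(self, fn, *args, **kwargs):
        result = fn(*args, **kwargs)
        if self._profile:
            mx.nd.waitall()  # make sure all pending async work is captured
            profiler.set_state('stop')
        self._save_profile()
        return result

    def _save_profile(self):
        if self._profile:
            print(profiler.dumps())
            profiler.dump()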