class TestMLP_network(): def __init__(self, mlp_network): self.net = mlp_network #MLP_network(18,32,64,32,2) self.map_strategy = MapStrategyEyeriss self.resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DATA), ), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=512 // 2, # 512 B ) self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=0, unit_static=0) self.options = Option() def test_eyeriss_isca16(self): network = self.net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, cache_stats = nnd.schedule_search(self.options) if not tops: sys.stderr.write("No valid dataflow found!") return None dfsch = tops[0] ## Write results. res_map = OrderedDict() res_map['net'] = "MLP_L" res_map['batch'] = batch_size res_map['resource'] = self.resource._asdict() res_map['cost'] = self.cost._asdict() res_map['options'] = self.options._asdict() res_map['cache_stats'] = cache_stats stats = stats_dict(dfsch, self.cost) for key, val in stats.items(): res_map[key] = val return res_map
def do_scheduling(args): ''' Get optimal scheduling for given problem. Return a result schedule. ''' ## Network. network = import_network(args.net) batch_size = args.batch ## Resource. dim_nodes = PhyDim2(*args.nodes) dim_array = PhyDim2(*args.array) # Sizes of gbuf and regf are in words. word = (args.word + 7) / 8 size_gbuf = args.gbuf / word size_regf = args.regf / word array_bus_width = args.bus_width // args.word if not array_bus_width: array_bus_width = float('inf') dram_bandwidth = args.dram_bw / word proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0), type=NodeRegion.PROC) if args.mem_type == '2D': # Memory nodes are on two sides. data_region = NodeRegion(dim=PhyDim2(2, 2), origin=PhyDim2(0, 0), dist=dim_nodes - PhyDim2(1, 1), type=NodeRegion.DRAM) assert data_region.rel2abs(PhyDim2(1, 1)) + PhyDim2(1, 1) \ == proc_region.dim elif args.mem_type == '3D': # Memory nodes are on the top. data_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0), type=NodeRegion.DRAM) resource = Resource(proc_region=proc_region, dram_region=data_region, src_data_region=data_region, dst_data_region=data_region, dim_array=dim_array, size_gbuf=size_gbuf, size_regf=size_regf, array_bus_width=array_bus_width, dram_bandwidth=dram_bandwidth, no_time_mux=False) ## Cost. hier_cost = [0] * me.NUM hier_cost[me.DRAM] = args.hier_cost[0] hier_cost[me.GBUF] = args.hier_cost[1] hier_cost[me.ITCN] = args.hier_cost[2] hier_cost[me.REGF] = args.hier_cost[3] cost = Cost(mac_op=args.op_cost, mem_hier=tuple(hier_cost), noc_hop=args.hop_cost, idl_unit=args.unit_idle_cost) ## Options. bypass = [True] * de.NUM bypass[de.IFM] = 'i' not in args.disable_bypass bypass[de.OFM] = 'o' not in args.disable_bypass bypass[de.FIL] = 'f' not in args.disable_bypass options = Option( sw_gbuf_bypass=tuple(bypass), sw_solve_loopblocking=args.solve_loopblocking, hw_access_forwarding=args.enable_access_forwarding, hw_gbuf_sharing=args.enable_gbuf_sharing, hw_gbuf_save_writeback=args.enable_save_writeback, partition_hybrid=args.hybrid_partition, partition_batch=args.batch_partition, partition_ifmaps=args.ifmaps_partition, partition_interlayer=args.interlayer_partition, layer_pipeline_time_ovhd=args.layer_pipeline_time_overhead, layer_pipeline_max_degree=args.layer_pipeline_max_degree, layer_pipeline_opt=not args.disable_interlayer_opt, opt_goal=args.goal.lower(), ntops=args.top, nprocesses=args.processes, verbose=args.verbose) ## Search schedules. nnd = NNDataflow(network, batch_size, resource, cost, MapStrategyEyeriss) tbeg = time.time() tops, cache_stats = nnd.schedule_search(options) tend = time.time() telapsed = tend - tbeg if not tops: sys.stderr.write('No valid dataflow found.\n') return None top = tops[0] ## Write results. res_map = OrderedDict() res_map['version'] = get_version(with_local=True) res_map['net'] = args.net res_map['batch'] = args.batch res_map['resource'] = resource._asdict() res_map['cost'] = cost._asdict() res_map['options'] = options._asdict() res_map['cache_stats'] = cache_stats res_map['elapsed'] = telapsed stats = stats_dict(top, cost) for key, val in stats.items(): res_map[key] = val return res_map
class TestNNDataflow(unittest.TestCase): ''' Tests for NNDataflow module. ''' def setUp(self): self.alex_net = import_network('alex_net') self.vgg_net = import_network('vgg_net') net = Network('simple') net.set_input_layer(InputLayer(4, 2)) net.add('1', ConvLayer(4, 4, 2, 1)) net.add('2', ConvLayer(4, 4, 2, 1)) # Two more layers to avoid single-segment case. net.add('a1', ConvLayer(4, 1, 1, 1, strd=2)) net.add('a2', ConvLayer(1, 1, 1, 1)) self.simple_net = net net = Network('complex') net.set_input_layer(InputLayer(8, 8)) net.add('1', ConvLayer(8, 8, 8, 1)) net.add('2a', ConvLayer(8, 8, 8, 1), prevs=('1',)) net.add('3a', ConvLayer(8, 8, 8, 1)) net.add('2b', ConvLayer(8, 8, 8, 1), prevs=('1',)) net.add('3b', ConvLayer(8, 8, 8, 1)) net.add('4', ConvLayer(16, 8, 8, 1), prevs=('3a', '3b')) self.complex_net = net self.map_strategy = MapStrategyEyeriss self.resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=512 // 2, # 512 B array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=0, idl_unit=0) self.options = Option() def test_invalid_network(self): ''' Invalid network argument. ''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*network.*'): _ = NNDataflow(self.alex_net.input_layer(), 4, self.resource, self.cost, self.map_strategy) def test_invalid_resource(self): ''' Invalid network argument. ''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*resource.*'): _ = NNDataflow(self.alex_net, 4, self.resource.proc_region, self.cost, self.map_strategy) def test_invalid_cost(self): ''' Invalid network argument. ''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*cost.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost._asdict(), self.map_strategy) def test_invalid_map_strategy(self): ''' Invalid map_strategy argument. ''' class _DummyClass(): # pylint: disable=too-few-public-methods pass with self.assertRaisesRegex(TypeError, 'NNDataflow: .*map_strategy.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost, _DummyClass) def test_verbose(self): ''' Verbose mode. ''' network = self.alex_net batch_size = 16 options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, verbose=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO() sys.stderr = stderr = StringIO() tops, _ = nnd.schedule_search(options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertTrue(tops) self.assertFalse(stdout_value) for layer in network: self.assertIn(layer, stderr_value) def test_pipelining(self): ''' Pipelining. ''' network = self.alex_net batch_size = 1 options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_infeasible(self): ''' Enter fast forward due to infeasible constraint. ''' network = self.simple_net batch_size = 1 # Very small gbuf size. Small fmap tpart is infeasible. resource = self.resource._replace( dim_array=PhyDim2(2, 2), size_gbuf=16) options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) # No pipelining is feasible. for dtfl in tops: self.assertTupleEqual(dtfl['1'].sched_seq, (0, 0, 0)) self.assertTupleEqual(dtfl['2'].sched_seq, (1, 0, 0)) def test_fast_forward_found(self): ''' Enter fast forward due to early found. ''' network = self.simple_net batch_size = 1 # No time overhead limit. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_crit_time(self): ''' Enter fast forward due to long critical time. ''' network = self.simple_net batch_size = 1 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), dim_array=PhyDim2(1, 1), ) # Very strict time overhead limit. # At large fmap tpart, utilization decreases and critical time would # increase. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=1e-3) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_frontier(self): ''' Enter fast forward due to off-frontier. ''' network = self.simple_net batch_size = 16 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), dim_array=PhyDim2(2, 2), ) # No time overhead limit. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fmap_fwd(self): ''' Fmap forward with shared mem sources or both on/off-chip destinations. ''' network = self.complex_net batch_size = 16 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), ) # No time overhead limit. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_sched_instance_sharing(self): ''' Scheduling instance sharing between layers. ''' network = self.alex_net batch_size = 1 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) self.assertIs(nnd.layer_sched_dict['conv1_a'], nnd.layer_sched_dict['conv1_b']) self.assertIs(nnd.layer_sched_dict['conv2_a'], nnd.layer_sched_dict['conv2_b']) self.assertIs(nnd.layer_sched_dict['pool1_a'], nnd.layer_sched_dict['pool1_b']) def test_opt_goal(self): ''' Optimization goal. ''' network = self.alex_net batch_size = 8 resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC) ) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) options_e = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='e', ntops=16) tops_e, _ = nnd.schedule_search(options_e) self.assertTrue(tops_e) options_d = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='d', ntops=16) tops_d, _ = nnd.schedule_search(options_d) self.assertTrue(tops_d) options_ed = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='ed', ntops=16) tops_ed, _ = nnd.schedule_search(options_ed) self.assertTrue(tops_ed) self.assertLess(tops_e[0].total_cost, tops_d[0].total_cost) self.assertLess(tops_e[0].total_cost, tops_ed[0].total_cost) self.assertLess(tops_d[0].total_time, tops_e[0].total_time) self.assertLess(tops_d[0].total_time, tops_ed[0].total_time) # Sum of the smallest ED may not be the smallest; allow for error. self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time, tops_e[0].total_cost * tops_e[0].total_time * 1.05) self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time, tops_d[0].total_cost * tops_d[0].total_time * 1.05) def test_ext_layer(self): ''' With external layers. ''' network = self.alex_net network.add_ext('e0', InputLayer(4, 1)) network.add('l1', FCLayer(1000, 4)) network.add('l2', FCLayer(8, 4), prevs=('e0', 'l1')) batch_size = 16 options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_no_valid_dataflow(self): ''' No valid dataflow is found. ''' # Very small REGF. self.resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=2, array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertFalse(tops) # With inter-layer pipelining. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) tops, _ = nnd.schedule_search(options) self.assertFalse(tops) def test_scheduling_failure(self): ''' Layer scheduling failure. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, MapStrategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO() sys.stderr = stderr = StringIO() with self.assertRaises(NotImplementedError): _ = nnd.schedule_search(self.options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertFalse(stdout_value) self.assertIn('Failed', stderr_value) def test_eyeriss_isca16(self): ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. # Results as cost for each component: header = 'ALU, DRAM, Buffer, Array, RF' cost_bkdn = {} for layer in ['conv{}'.format(i) for i in range(1, 6)] \ + ['fc{}'.format(i) for i in range(1, 4)]: op_cost = 0 access_cost = [0] * me.NUM for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] op_cost += sr.total_ops * self.cost.mac_op access_cost = [ac + a * c for ac, a, c in zip(access_cost, sr.total_accesses, self.cost.mem_hier)] cost_bkdn[layer] = [] # To 1e9. cost_bkdn[layer].append(op_cost / 1e9) cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9) cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9) cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9) cost_bkdn[layer].append(access_cost[me.REGF] / 1e9) # Check the major parts: ALU, DRAM, RF. major_cost_bkdn_ref = {'conv1': [1.69, 2.46, 6.75], 'conv2': [3.58, 2.27, 14.33], 'conv3': [2.39, 2.02, 9.57], 'conv4': [1.79, 1.57, 7.18], 'conv5': [1.20, 1.05, 4.78], 'fc1': [0.60, 7.78, 2.42], 'fc2': [0.27, 3.39, 1.07], 'fc3': [0.07, 0.84, 0.26], } for layer in cost_bkdn: success = all(abs(a - b) < 0.1 for a, b in zip(cost_bkdn[layer][:2] + cost_bkdn[layer][-1:], major_cost_bkdn_ref[layer])) self.assertTrue(success, 'test_eyeriss_isca16: ' 'ALU, DRAM, RF cost diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}' .format(layer, header, cost_bkdn[layer], major_cost_bkdn_ref[layer])) def test_eyeriss_isscc16(self): ''' Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V. ''' network = self.alex_net batch_size = 4 resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(12, 14), size_gbuf=108 * 1024 // 2, # 108 kB size_regf=261, # 225 + 12 + 24 array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(460e-12, 15e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, idl_unit=30e-3 / 200e6) # 30 mW GBUF + REGF nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. # Results as stats of the rows in the table. header = 'Power, Processing Latency, Ops, Active PEs, Filter size' stats = {} for layer in ['conv{}'.format(i) for i in range(1, 6)]: onchip_cost = 0 time = 0 ops = 0 fil_size = 0 for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] onchip_cost += sr.total_cost \ - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM] time += sr.total_time ops += sr.total_ops fil_size += network[layer_part].total_filter_size() power = onchip_cost / (time / 200e6) * 1e3 # mW active_pes = int(ops / time) stats[layer] = [] stats[layer].append(power) stats[layer].append(time / 200.e3) # cycles to ms stats[layer].append(ops / 1e6) # to MOPs stats[layer].append(active_pes) stats[layer].append(fil_size / 1e3) # to k # Check. stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8], # Act PE 154 'conv2': [288, 39.2, 895.79, 135, 307.2], 'conv3': [266, 21.8, 598.1, 156, 884.7], 'conv4': [235, 16.0, 448.6, 156, 663.6], 'conv5': [236, 10.0, 299.0, 156, 442.4], } for layer in stats: success = (0.6 * stats_ref[layer][0] < stats[layer][0] < stats_ref[layer][0]) \ and (0.8 * stats_ref[layer][1] < stats[layer][1] < stats_ref[layer][1]) \ and all(abs(a - b) < 0.1 for a, b in zip(stats[layer][2:], stats_ref[layer][2:])) self.assertTrue(success, 'test_eyeriss_isscc16: ' 'stats diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}' .format(layer, header, stats[layer], stats_ref[layer])) def test_eyeriss_asplos17(self): ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. ''' network = self.alex_net batch_size = 16 ## L-1 configuration. resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=576056 // 2, # 576 kB size_regf=1024 // 2, # 1 kB array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(240e-12, 28e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, idl_unit=320e-12) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch_l1 = tops[0] ## T-16 configuration. resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dim_array=PhyDim2(14, 14), size_gbuf=133032 // 2, # 133 kB size_regf=512 // 2, # 512 B array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12), # pJ/16-b noc_hop=40e-12, idl_unit=200e-12) options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) dfsch_t16 = tops[0] ## Check results. # Same workload. self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops) # Performance of T-16 is proportional to PE resource (20% margin). self.assertLess(dfsch_t16.total_time, 1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16)) # Energy reduced by > 30%. # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7) # With dimension restriction on partitioning, this is slightly violated. self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
def do_scheduling(args): ''' Get optimal scheduling for given problem. Return a result schedule. ''' ## Network. network = import_network(args.net) batch_size = args.batch ## Resource. dim_nodes = PhyDim2(*args.nodes) dim_array = PhyDim2(*args.array) # Sizes of gbuf and regf are in words. word = (args.word + 7) / 8 size_gbuf = args.gbuf / word size_regf = args.regf / word proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0), type=NodeRegion.PROC) if args.mem_type == '2D': # Memory nodes are on two sides. data_regions = (NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1), origin=PhyDim2(h=0, w=0), type=NodeRegion.DATA), NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1), origin=PhyDim2(h=0, w=dim_nodes.w - 1), type=NodeRegion.DATA)) elif args.mem_type == '3D': # All nodes have memory. data_regions = (NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0), type=NodeRegion.DATA), ) resource = Resource(proc_region=proc_region, data_regions=data_regions, dim_array=dim_array, size_gbuf=size_gbuf, size_regf=size_regf) ## Cost. hier_cost = [0] * me.NUM hier_cost[me.DRAM] = args.hier_cost[0] hier_cost[me.GBUF] = args.hier_cost[1] hier_cost[me.ITCN] = args.hier_cost[2] hier_cost[me.REGF] = args.hier_cost[3] cost = Cost(mac_op=args.op_cost, mem_hier=tuple(hier_cost), noc_hop=args.hop_cost, unit_static=args.unit_static_cost) ## Options. bypass = [True] * de.NUM bypass[de.IFM] = 'i' not in args.disable_bypass bypass[de.OFM] = 'o' not in args.disable_bypass bypass[de.FIL] = 'f' not in args.disable_bypass options = Option(sw_gbuf_bypass=tuple(bypass), sw_solve_loopblocking=args.solve_loopblocking, partition_hybrid=args.hybrid_partition, partition_batch=args.batch_partition, partition_ifmaps=args.ifmaps_partition, ntops=args.top, nprocesses=args.processes, verbose=args.verbose) ## Search schedules. nnd = NNDataflow(network, batch_size, resource, cost, MapStrategyEyeriss) tops, cache_stats = nnd.schedule_search(options) if not tops: sys.stderr.write('No valid dataflow found.\n') return None top = tops[0] ## Write results. res_map = OrderedDict() res_map['version'] = get_version(with_local=True) res_map['net'] = args.net res_map['batch'] = args.batch res_map['resource'] = resource._asdict() res_map['cost'] = cost._asdict() res_map['options'] = options._asdict() res_map['cache_stats'] = cache_stats stats = stats_dict(top, cost) for key, val in stats.items(): res_map[key] = val return res_map
class TestNNDataflow(unittest.TestCase): ''' Tests for NNDataflow module. ''' def setUp(self): self.alex_net = import_network('alex_net') self.vgg_net = import_network('vgg_net') self.map_strategy = MapStrategyEyeriss self.resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DATA), ), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=512 // 2, # 512 B ) self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=0, unit_static=0) self.options = Option() def test_invalid_network(self): ''' Invalid network argument. ''' with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*network.*'): _ = NNDataflow(self.alex_net.input_layer(), 4, self.resource, self.cost, self.map_strategy) def test_invalid_resource(self): ''' Invalid network argument. ''' with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*resource.*'): _ = NNDataflow(self.alex_net, 4, self.resource.proc_region, self.cost, self.map_strategy) def test_invalid_cost(self): ''' Invalid network argument. ''' with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*cost.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost._asdict(), self.map_strategy) def test_invalid_map_strategy(self): ''' Invalid map_strategy argument. ''' class _DummyClass(object): # pylint: disable=too-few-public-methods pass with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*map_strategy.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost, _DummyClass) def test_verbose(self): ''' Verbose mode. ''' network = self.alex_net batch_size = 16 options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, verbose=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO.StringIO() sys.stderr = stderr = StringIO.StringIO() tops, _ = nnd.schedule_search(options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertTrue(tops) self.assertFalse(stdout_value) for layer in network: self.assertIn(layer, stderr_value) def test_no_valid_dataflow(self): ''' No valid dataflow is found. ''' # Very small REGF. self.resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DATA), ), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=2, ) nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertFalse(tops) def test_scheduling_failure(self): ''' Layer scheduling failure. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, MapStrategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO.StringIO() sys.stderr = stderr = StringIO.StringIO() with self.assertRaises(NotImplementedError): _ = nnd.schedule_search(self.options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertFalse(stdout_value) self.assertIn('Failed', stderr_value) def test_eyeriss_isca16(self): ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. # Results as cost for each component: header = 'ALU, DRAM, Buffer, Array, RF' cost_bkdn = {} for layer in ['conv{}'.format(i) for i in range(1, 6)] \ + ['fc{}'.format(i) for i in range(1, 4)]: op_cost = 0 access_cost = [0] * me.NUM for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] op_cost += sr.total_ops * self.cost.mac_op access_cost = [ ac + a * c for ac, a, c in zip( access_cost, sr.total_accesses, self.cost.mem_hier) ] cost_bkdn[layer] = [] # To 1e9. cost_bkdn[layer].append(op_cost / 1e9) cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9) cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9) cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9) cost_bkdn[layer].append(access_cost[me.REGF] / 1e9) # Check the major parts: ALU, DRAM, RF. major_cost_bkdn_ref = { 'conv1': [1.69, 2.46, 6.75], 'conv2': [3.58, 2.27, 14.33], 'conv3': [2.39, 2.02, 9.57], 'conv4': [1.79, 1.57, 7.18], 'conv5': [1.20, 1.05, 4.78], 'fc1': [0.60, 7.78, 2.42], 'fc2': [0.27, 3.39, 1.07], 'fc3': [0.07, 0.84, 0.26], } for layer in cost_bkdn: success = all( abs(a - b) < 0.1 for a, b in zip(cost_bkdn[layer][:2] + cost_bkdn[layer][-1:], major_cost_bkdn_ref[layer])) self.assertTrue( success, 'test_eyeriss_isca16: ' 'ALU, DRAM, RF cost diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}'.format(layer, header, cost_bkdn[layer], major_cost_bkdn_ref[layer])) def test_eyeriss_isscc16(self): ''' Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V. ''' network = self.alex_net batch_size = 4 resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DATA), ), dim_array=PhyDim2(12, 14), size_gbuf=108 * 1024 // 2, # 108 kB size_regf=261, # 225 + 12 + 24 ) cost = Cost( mac_op=2e-12, mem_hier=(460e-12, 15e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, unit_static=30e-3 / 200e6) # 30 mW GBUF + REGF nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. # Results as stats of the rows in the table. header = 'Power, Processing Latency, Ops, Active PEs, Filter size' stats = {} for layer in ['conv{}'.format(i) for i in range(1, 6)]: onchip_cost = 0 time = 0 ops = 0 fil_size = 0 for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] onchip_cost += sr.total_cost \ - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM] time += sr.total_time ops += sr.total_ops fil_size += network[layer_part].total_filter_size() power = onchip_cost / (time / 200e6) * 1e3 # mW active_pes = int(ops / time) stats[layer] = [] stats[layer].append(power) stats[layer].append(time / 200.e3) # cycles to ms stats[layer].append(ops / 1e6) # to MOPs stats[layer].append(active_pes) stats[layer].append(fil_size / 1e3) # to k # Check. stats_ref = { 'conv1': [332, 16.5, 421.66, 151, 34.8], # Act PE 154 'conv2': [288, 39.2, 895.79, 135, 307.2], 'conv3': [266, 21.8, 598.1, 156, 884.7], 'conv4': [235, 16.0, 448.6, 156, 663.6], 'conv5': [236, 10.0, 299.0, 156, 442.4], } for layer in stats: success = (0.6 * stats_ref[layer][0] < stats[layer][0] < stats_ref[layer][0]) \ and (0.8 * stats_ref[layer][1] < stats[layer][1] < stats_ref[layer][1]) \ and all(abs(a - b) < 0.1 for a, b in zip(stats[layer][2:], stats_ref[layer][2:])) self.assertTrue( success, 'test_eyeriss_isscc16: ' 'stats diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}'.format(layer, header, stats[layer], stats_ref[layer])) def test_eyeriss_asplos17(self): ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. ''' network = self.alex_net batch_size = 16 ## L-1 configuration. resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DATA), ), dim_array=PhyDim2(16, 16), size_gbuf=576056 // 2, # 576 kB size_regf=1024 // 2, # 1 kB ) cost = Cost( mac_op=2e-12, mem_hier=(240e-12, 28e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, unit_static=320e-12) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch_l1 = tops[0] ## T-16 configuration. resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DATA), ), dim_array=PhyDim2(14, 14), size_gbuf=133032 // 2, # 133 kB size_regf=512 // 2, # 512 B ) cost = Cost( mac_op=2e-12, mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12), # pJ/16-b noc_hop=40e-12, unit_static=200e-12) options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) dfsch_t16 = tops[0] ## Check results. # Same workload. self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops) # Performance of T-16 is proportional to PE resource (20% margin). self.assertLess(dfsch_t16.total_time, 1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16)) # Energy reduced by > 30%. self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)