def test_fpgabackend_rawhls(self):
    # resource allocation function to set the number of PE/SIMD per layer;
    # the allocation is statically determined for this test case.
    def res_alloc_predetermined(pipeline, net, dev):
        ret_pipeline = copy.deepcopy(pipeline)
        layer_simd = [16, 64, 64, 64]
        layer_pe = [64, 64, 64, 10]
        for i in range(4):
            ret_pipeline[i].simd = layer_simd[i]
            ret_pipeline[i].pe = layer_pe[i]
        return ret_pipeline
    # make a temp dir for the generated HLS
    dirpath = tempfile.mkdtemp()
    # pick all layers except the first (input quantization) and the last
    # (final batchnorm) of the streamlined network
    hlslayers = self.streamlined_net.layers[1:-1]
    # call the FPGA backend to generate HLS and compile the raw HLS sim
    dev = device.Device('XLNX:PYNQ-Z1.json', 100)
    ret = fpga_backend.synthesize(hlslayers, self.net, dev,
                                  res_alloc_predetermined, dirpath, "sfcall-")
    hlspipeline = ret.getSimLayer()
    # build a "mixed pipeline", where the first and last layers run in
    # device-neutral simulation and everything in the middle is handled
    # by the HLS sim executable
    mixed_pipeline = [self.streamlined_net.layers[0]] + \
        hlspipeline + [self.streamlined_net.layers[-1]]
    # test on MNIST
    (ok, nok) = testOnMNIST(nn.NN(layers=mixed_pipeline), self.numImagesToTest)
    # remove the temp dir
    # shutil.rmtree(dirpath)
    self.assertTrue(ok == self.ok_golden and nok == self.nok_golden)
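# A minimal sketch (not part of the original test) of an alternative resource
# allocation callback with the same (pipeline, net, dev) signature as
# res_alloc_predetermined above: it applies one uniform PE/SIMD setting to
# every layer instead of a per-layer table. The .simd/.pe attributes are taken
# from the callback above; the function name and default values are
# illustrative assumptions only.
def res_alloc_uniform(pipeline, net, dev, simd=16, pe=16):
    ret_pipeline = copy.deepcopy(pipeline)
    for layer in ret_pipeline:
        layer.simd = simd
        layer.pe = pe
    return ret_pipeline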
def test_cycles_per_layer(self):
    l = CaffeLoader(
        None, "./FINN/inputs/dorefanet-pruned-without-extra-messages.prototxt")
    net = nn.NN(l)
    dev = device.Device('XLNX:KU115.json')
    perfmodel = pm.PerfModel(net, dev)
    fps = perfmodel.maximise_fps()
    for idx, layer in enumerate(net.layers):
        in_chans = layer.getInputSize()
        out_chans = layer.getOutputSize()
        out_dim = layer.get_out_dim()
        if isinstance(in_chans, tuple):
            print in_chans
            in_chans = in_chans[0]
        if isinstance(out_chans, tuple):
            print out_chans
            out_chans = out_chans[0]
        if isinstance(out_dim, tuple):
            print out_dim
            out_dim = out_dim[0]
        print perfmodel.SIMD[idx], in_chans
        print perfmodel.PE[idx], out_chans
        print perfmodel.MMV[idx], out_dim
        self.assertLessEqual(perfmodel.SIMD[idx], in_chans)
        self.assertLessEqual(perfmodel.PE[idx], out_chans)
        self.assertLessEqual(perfmodel.MMV[idx], out_dim)
def test_simd_pe_mmv_constraints(self):
    l = CaffeLoader(None, "./FINN/inputs/sfc.prototxt")
    net = nn.NN(l)
    dev = device.Device('XLNX:KU115.json')
    perfmodel = pm.PerfModel(net, dev)
    fps = perfmodel.maximise_fps()
    for idx, layer in enumerate(net.layers):
        self.assertLessEqual(perfmodel.SIMD[idx], layer.getInputSize())
        self.assertLessEqual(perfmodel.PE[idx], layer.getOutputSize())
        self.assertLessEqual(perfmodel.MMV[idx], layer.get_out_dim())
def test_cycles_per_op(self):
    l = CaffeLoader("./FINN/inputs/sfc.caffemodel",
                    "./FINN/inputs/sfc.prototxt")
    net = nn.NN(l)
    dev = device.Device('XLNX:VU9P.json')
    perfmodel = pm.PerfModel(net, dev)
    ops = perfmodel.network_utilisation()
    num_matrix_layers = net.count_matrix_layers()
    self.assertEqual(ops['luts'],
                     2 * num_matrix_layers * dev.lut_cost_per_op())
def demo_lfc():
    logging.basicConfig(
        filename='FINN.log',
        level=logging.INFO)  # set to logging.WARNING to silence the demo log
    lfcnetwork = []
    W0 = np.zeros((1024, 832))   # (out_chans, in_chans)
    W1 = np.zeros((1024, 1024))
    W2 = np.zeros((1024, 1024))
    W3 = np.zeros((64, 1024))
    # FullyConnectedLayer args: weights, wbits, ibits, obits
    lfcnetwork.append(layers.FullyConnectedLayer(W0, 1, 1, 1))
    lfcnetwork.append(layers.FullyConnectedLayer(W1, 1, 1, 1))
    lfcnetwork.append(layers.FullyConnectedLayer(W2, 1, 1, 1))
    lfcnetwork.append(layers.FullyConnectedLayer(W3, 1, 1, 1))
    net = FINN.core.nn.NN(layers=lfcnetwork)
    dev = device.Device('XLNX:VU9P.json', frequency=192.4)
    perf = perf_model.PerfModel(net, dev)
    fps = perf.maximise_fps()
    # uncomment to override the folding factors chosen by maximise_fps():
    # perf.SIMD[0] = 64
    # perf.SIMD[1] = 64
    # perf.SIMD[2] = 64
    # perf.SIMD[3] = 64
    #
    # perf.PE[0] = 256
    # perf.PE[1] = 256
    # perf.PE[2] = 256
    # perf.PE[3] = 16
    fps = perf.fps()
    perf.nswg.calculate_neural_folding()
    perf.nswg.calculate_write_block_cycles()
    perf.nswg.calculate_read_block_cycles()
    perf.nswg.calculate_total_cycles()
    perf.nswg.calculate_input_multipliers()
    perf.print_folding_factors()
    perf.print_hardware_cost()
    perf.print_topology()
    perf.print_cycles()
    fps = perf.fps()
    print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f MHz" % (
        fps,
        perf.network_utilisation()['luts'] / dev.luts * 100,
        perf.network_utilisation()['brams'] / dev.brams * 100,
        dev.frequency)
def setUp(self):
    nname = "lfc-w1a1"
    proto = FINN_ROOT + "/inputs/%s.prototxt" % nname
    weights = FINN_ROOT + "/inputs/%s.caffemodel" % nname
    l = CaffeLoader(weights, proto)
    self.net = nn.NN(l)
    frequency = 300
    self.dev = device.Device('XLNX:VU9P.json', frequency)
    self.streamlined_net = copy.deepcopy(self.net)
    print self.streamlined_net.layers
    self.streamlined_net.layers = transform.makeCromulent(
        self.streamlined_net.layers)
    print self.streamlined_net.layers
    # use the first numImagesToTest images of the test set for verification
    self.numImagesToTest = 1000
    # expected number of successful predictions
    self.ok_golden = 967
    # expected number of unsuccessful predictions
    self.nok_golden = 33
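# A hedged sketch (not in the original fixture) of a sanity check that runs
# the streamlined network through the same MNIST harness used by
# test_fpgabackend_rawhls above; testOnMNIST, numImagesToTest and the golden
# counts come from this file, but the method name is an illustrative
# assumption.
def test_streamlined_golden(self):
    (ok, nok) = testOnMNIST(self.streamlined_net, self.numImagesToTest)
    self.assertEqual(ok, self.ok_golden)
    self.assertEqual(nok, self.nok_golden)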
def demo_hwgq_import():
    l = CaffeLoader(None, "inputs/sfc.prototxt")
    net = FINN.core.nn.NN(l)
    dev = device.Device('XLNX:KU115.json')
    perf = perf_model.PerfModel(net, dev)
    perf.print_folding_factors()
    perf.print_hardware_cost()
    for idx, val in enumerate(perf.SIMD):
        perf.SIMD[idx] = 5
        # perf.PE[idx] = 10
    perf.print_folding_factors()
    perf.print_hardware_cost()
    for idx, val in enumerate(perf.SIMD):
        perf.SIMD[idx] = 20
        # perf.PE[idx] = 100
    perf.print_folding_factors()
    perf.print_hardware_cost()
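# A small sketch (not in the original demo) that sweeps one uniform SIMD value
# across all layers and records the modelled LUT cost at each point, assuming
# (as the prints above suggest) that the cost reported by
# perf.network_utilisation() tracks the current SIMD settings. The candidate
# SIMD values and the function name are illustrative.
def demo_simd_sweep(perf):
    costs = {}
    for simd in [5, 10, 20, 40]:
        for idx in range(len(perf.SIMD)):
            perf.SIMD[idx] = simd
        costs[simd] = perf.network_utilisation()['luts']
    for simd in sorted(costs):
        print "SIMD=%d -> %s LUTs" % (simd, costs[simd])
    return costs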
def demo_sfc():
    logging.basicConfig(
        filename='FINN.log',
        level=logging.INFO)  # set to logging.WARNING to silence the demo log
    sfcnetwork = []
    W0 = np.zeros((64, 3, 3, 3))      # (out_chans, in_chans, kernel, kernel)
    W1 = np.zeros((64, 64, 3, 3))
    W2 = np.zeros((128, 64, 3, 3))
    W3 = np.zeros((128, 128, 3, 3))
    W4 = np.zeros((256, 128, 3, 3))
    W5 = np.zeros((256, 256, 3, 3))
    W6 = np.zeros((512, 256))
    W7 = np.zeros((512, 512))
    W8 = np.zeros((10, 512))
    sfcnetwork.append(layers.ConvolutionLayer(
        W0, 32, 0, 1, 1, 1, 1, 0))  # in_dim, pad, stride, wbits, ibits, obits
    sfcnetwork.append(layers.ConvolutionLayer(W1, 30, 0, 1, 1, 1, 1, 0))
    sfcnetwork.append(layers.ConvolutionLayer(W2, 14, 0, 1, 1, 1, 1, 0))
    sfcnetwork.append(layers.ConvolutionLayer(W3, 12, 0, 1, 1, 1, 1, 0))
    sfcnetwork.append(layers.ConvolutionLayer(W4, 5, 0, 1, 1, 1, 1, 0))
    sfcnetwork.append(layers.ConvolutionLayer(W5, 3, 0, 1, 1, 1, 1, 0))
    sfcnetwork.append(layers.FullyConnectedLayer(W6, 1, 1, 1))
    sfcnetwork.append(layers.FullyConnectedLayer(W7, 1, 1, 1))
    sfcnetwork.append(layers.FullyConnectedLayer(W8, 1, 1, 1))
    net = FINN.core.nn.NN(layers=sfcnetwork)
    dev = device.Device('XLNX:VU9P.json', frequency=248.5)  # measured on AWS
    perf = perf_model.PerfModel(net, dev)
    fps = perf.maximise_fps()
    # folding factors from the BNN spreadsheet, t3
    perf.SIMD[0] = 3
    perf.SIMD[1] = 64
    perf.SIMD[2] = 64
    perf.SIMD[3] = 64
    perf.SIMD[4] = 64
    perf.SIMD[5] = 64
    perf.SIMD[6] = 16
    perf.SIMD[7] = 16
    perf.SIMD[8] = 16
    perf.PE[0] = 64
    perf.PE[1] = 64
    perf.PE[2] = 64
    perf.PE[3] = 64
    perf.PE[4] = 64
    perf.PE[5] = 64
    perf.PE[6] = 16
    perf.PE[7] = 16
    perf.PE[8] = 4
    perf.MMV[0] = 1
    perf.MMV[1] = 1
    perf.MMV[2] = 1
    perf.MMV[3] = 1
    perf.MMV[4] = 1
    perf.MMV[5] = 1
    perf.MMV[6] = 1
    perf.MMV[7] = 1
    perf.MMV[8] = 1
    # FPS given the above folding factors
    fps = perf.fps()
    perf.nswg.calculate_neural_folding()
    perf.nswg.calculate_write_block_cycles()
    perf.nswg.calculate_read_block_cycles()
    perf.nswg.calculate_total_cycles()
    perf.nswg.calculate_input_multipliers()
    perf.print_folding_factors()
    perf.print_hardware_cost()
    perf.print_topology()
    perf.print_cycles()
    fps = perf.fps()
    print(perf.nswg)
    print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f MHz" % (
        fps,
        perf.network_utilisation()['luts'] / dev.luts * 100,
        perf.network_utilisation()['brams'] / dev.brams * 100,
        dev.frequency)
def demo_dorefa():
    logging.basicConfig(
        filename='FINN.log',
        level=logging.INFO)  # set to logging.WARNING to silence the demo log
    dorefanetwork = []
    W0 = np.zeros((68, 3, 12, 12))    # (out_chans, in_chans, kernel, kernel)
    W1 = np.zeros((90, 34, 5, 5))
    W2 = np.zeros((272, 180, 3, 3))
    W3 = np.zeros((192, 136, 3, 3))
    W4 = np.zeros((128, 192, 3, 3))
    W5 = np.zeros((4096, 9216))
    W6 = np.zeros((4096, 4096))
    W7 = np.zeros((1000, 4096))
    dorefanetwork.append(layers.ConvolutionLayer(
        W0, 227, 0, 4, 1, 1, 1, 0))  # in_dim, pad, stride, wbits, ibits, obits
    dorefanetwork.append(layers.ConvolutionLayer(W1, 58, 0, 1, 1, 1, 1, 0))
    dorefanetwork[-1].parallel = 2
    dorefanetwork.append(layers.ConvolutionLayer(W2, 29, 0, 1, 1, 1, 1, 0))
    dorefanetwork.append(layers.ConvolutionLayer(W3, 16, 0, 1, 1, 1, 1, 0))
    dorefanetwork[-1].parallel = 2
    dorefanetwork.append(layers.ConvolutionLayer(W4, 16, 0, 1, 1, 1, 1, 0))
    dorefanetwork[-1].parallel = 2
    dorefanetwork.append(layers.FullyConnectedLayer(W5, 1, 1, 1))
    dorefanetwork.append(layers.FullyConnectedLayer(W6, 1, 1, 1))
    dorefanetwork.append(layers.FullyConnectedLayer(W7, 1, 1, 1))
    net = FINN.core.nn.NN(layers=dorefanetwork)
    dev = device.Device('XLNX:VU9P.json', frequency=101)  # measured on AWS
    perf = perf_model.PerfModel(net, dev)
    # folding factors from the BNN spreadsheet, t3
    perf.SIMD[0] = 3
    perf.SIMD[1] = 34
    perf.SIMD[2] = 45
    perf.SIMD[3] = 34
    perf.SIMD[4] = 64
    perf.SIMD[5] = 64
    perf.SIMD[6] = 64
    perf.SIMD[7] = 8
    perf.PE[0] = 68
    perf.PE[1] = 90
    perf.PE[2] = 136
    perf.PE[3] = 64
    perf.PE[4] = 32
    perf.PE[5] = 32
    perf.PE[6] = 16
    perf.PE[7] = 32
    perf.MMV[0] = 18
    perf.MMV[1] = 3
    perf.MMV[2] = 3
    perf.MMV[3] = 1
    perf.MMV[4] = 1
    perf.MMV[5] = 1
    perf.MMV[6] = 1
    perf.MMV[7] = 1
    # FPS given the above folding factors
    fps = perf.fps()
    perf.nswg.calculate_neural_folding()
    perf.nswg.calculate_write_block_cycles()
    perf.nswg.calculate_read_block_cycles()
    perf.nswg.calculate_total_cycles()
    perf.nswg.calculate_input_multipliers()
    perf.print_folding_factors()
    perf.print_hardware_cost()
    perf.print_topology()
    perf.print_cycles()
    fps = perf.fps()
    print(perf.nswg)
    print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f MHz" % (
        fps,
        perf.network_utilisation()['luts'] / dev.luts * 100,
        perf.network_utilisation()['brams'] / dev.brams * 100,
        dev.frequency)
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import FINN.core.perf_model as pm
import FINN.core.device as device
import FINN.core.nn as nn
from FINN.frontend.caffeloader import CaffeLoader

print "Hardware model"
frequency = 200
# dev = device.Device('XLNX:PYNQ-Z1.json', frequency)
dev = device.Device('XLNX:VU9P.json', frequency)
net = nn.NN(CaffeLoader(None, './FINN/inputs/lfc-w1a1.prototxt'))
perfmodel = pm.PerfModel(net, dev)
fps = perfmodel.maximise_fps()
print "Network Utilization"
print perfmodel.network_utilisation()['luts'] / dev.luts * 100
print perfmodel.network_utilisation()['brams'] / dev.brams * 100
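# A hedged sketch (not part of the original script) that repeats the same
# measurement for the other device description files referenced in this repo;
# it reuses only the API already shown above, and the device list is an
# assumption about which JSON files are present.
for devname in ['XLNX:PYNQ-Z1.json', 'XLNX:KU115.json', 'XLNX:VU9P.json']:
    d = device.Device(devname, frequency)
    p = pm.PerfModel(net, d)
    print devname
    print p.maximise_fps()
    print p.network_utilisation()['luts'] / d.luts * 100
    print p.network_utilisation()['brams'] / d.brams * 100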