def test_cycles_per_layer(self):
    """Folding factors chosen by maximise_fps() must respect each layer's
    natural bounds: SIMD <= input channels, PE <= output channels,
    MMV <= output dimension."""
    l = CaffeLoader(
        None, "./FINN/inputs/dorefanet-pruned-without-extra-messages.prototxt")
    net = nn.NN(l)
    dev = device.Device('XLNX:KU115.json')
    perfmodel = pm.PerfModel(net, dev)
    fps = perfmodel.maximise_fps()
    for idx, layer in enumerate(net.layers):
        # Use the layer yielded by enumerate() directly rather than
        # re-indexing net.layers[idx] three more times.
        in_chans = layer.getInputSize()
        out_chans = layer.getOutputSize()
        out_dim = layer.get_out_dim()
        # Some layer types report these quantities as tuples; constrain
        # against the first element in that case.
        if isinstance(in_chans, tuple):
            in_chans = in_chans[0]
        if isinstance(out_chans, tuple):
            out_chans = out_chans[0]
        if isinstance(out_dim, tuple):
            out_dim = out_dim[0]
        self.assertLessEqual(perfmodel.SIMD[idx], in_chans)
        self.assertLessEqual(perfmodel.PE[idx], out_chans)
        self.assertLessEqual(perfmodel.MMV[idx], out_dim)
def test_simd_pe_mmv_constraints(self):
    """After maximising fps, every layer's SIMD/PE/MMV folding factor
    must stay within that layer's input size, output size and output
    dimension respectively."""
    loader = CaffeLoader(None, "./FINN/inputs/sfc.prototxt")
    network = nn.NN(loader)
    target = device.Device('XLNX:KU115.json')
    model = pm.PerfModel(network, target)
    model.maximise_fps()
    for i, lyr in enumerate(network.layers):
        self.assertLessEqual(model.SIMD[i], lyr.getInputSize())
        self.assertLessEqual(model.PE[i], lyr.getOutputSize())
        self.assertLessEqual(model.MMV[i], lyr.get_out_dim())
def test_cycles_per_op(self):
    """The LUT usage reported by the model must equal the analytic value:
    2 * (number of matrix layers) * (device LUT cost per op)."""
    loader = CaffeLoader("./FINN/inputs/sfc.caffemodel",
                         "./FINN/inputs/sfc.prototxt")
    network = nn.NN(loader)
    target = device.Device('XLNX:VU9P.json')
    model = pm.PerfModel(network, target)
    usage = model.network_utilisation()
    matrix_layers = network.count_matrix_layers()
    expected_luts = 2 * matrix_layers * target.lut_cost_per_op()
    self.assertEqual(usage['luts'], expected_luts)
def res_alloc_predetermined(pipeline, net, dev): ret_pipeline = copy.deepcopy(pipeline) print "PIPELINE: ", ret_pipeline net.layers = ret_pipeline perfmodel = pm.PerfModel(net, dev) fps = perfmodel.maximise_fps() for i in range(len(ret_pipeline)): ret_pipeline[i].simd = perfmodel.SIMD[i] print "SIMD:", ret_pipeline[i].simd ret_pipeline[i].pe = perfmodel.PE[i] print "PE:", ret_pipeline[i].pe return ret_pipeline
def demo_lfc(): logging.basicConfig( filename='FINN.log', level=logging.INFO) # Changed WARNING to INFO if you want logging lfcnetwork = [] W0 = np.zeros((1024, 832)) # OutChans, InChans W1 = np.zeros((1024, 1024)) W2 = np.zeros((1024, 1024)) W3 = np.zeros((64, 1024)) lfcnetwork.append(layers.FullyConnectedLayer(W0, 1, 1, 1)) # wbits, ibits, obits lfcnetwork.append(layers.FullyConnectedLayer(W1, 1, 1, 1)) lfcnetwork.append(layers.FullyConnectedLayer(W2, 1, 1, 1)) lfcnetwork.append(layers.FullyConnectedLayer(W3, 1, 1, 1)) net = FINN.core.nn.NN(layers=lfcnetwork) dev = device.Device('XLNX:VU9P.json', frequency=192.4) perf = perf_model.PerfModel(net, dev) fps = perf.maximise_fps() # perf.SIMD[0] = 64 # perf.SIMD[1] = 64 # perf.SIMD[2] = 64 # perf.SIMD[3] = 64 # # perf.PE[0] = 256 # perf.PE[1] = 256 # perf.PE[2] = 256 # perf.PE[3] = 16 fps = perf.fps() perf.nswg.calculate_neural_folding() perf.nswg.calculate_write_block_cycles() perf.nswg.calculate_read_block_cycles() perf.nswg.calculate_total_cycles() perf.nswg.calculate_input_multipliers() perf.print_folding_factors() perf.print_hardware_cost() perf.print_topology() perf.print_cycles() fps = perf.fps() print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz" % ( fps, perf.network_utilisation()['luts'] / dev.luts * 100, perf.network_utilisation()['brams'] / dev.brams * 100, dev.frequency)
def demo_hwgq_import():
    """Load the SFC prototxt and show how forcing different SIMD folding
    factors changes the reported hardware cost."""
    loader = CaffeLoader(None, "inputs/sfc.prototxt")
    network = FINN.core.nn.NN(loader)
    target = device.Device('XLNX:KU115.json')
    perf = perf_model.PerfModel(network, target)
    perf.print_folding_factors()
    perf.print_hardware_cost()
    # Force SIMD = 5 on every layer and re-report the cost.
    for idx in range(len(perf.SIMD)):
        perf.SIMD[idx] = 5
    perf.print_folding_factors()
    perf.print_hardware_cost()
    # Same experiment with SIMD = 20.
    for idx in range(len(perf.SIMD)):
        perf.SIMD[idx] = 20
    perf.print_folding_factors()
    perf.print_hardware_cost()
def demo_sfc(): logging.basicConfig( filename='FINN.log', level=logging.INFO) # Changed WARNING to INFO if you want logging sfcnetwork = [] W0 = np.zeros((64, 3, 3, 3)) # out, in, kernel, kernel W1 = np.zeros((64, 64, 3, 3)) W2 = np.zeros((128, 64, 3, 3)) W3 = np.zeros((128, 128, 3, 3)) W4 = np.zeros((256, 128, 3, 3)) W5 = np.zeros((256, 256, 3, 3)) W6 = np.zeros((512, 256)) W7 = np.zeros((512, 512)) W8 = np.zeros((10, 512)) sfcnetwork.append(layers.ConvolutionLayer( W0, 32, 0, 1, 1, 1, 1, 0)) # in_dim, pad, stride, wbits, ibits, obits sfcnetwork.append(layers.ConvolutionLayer(W1, 30, 0, 1, 1, 1, 1, 0)) sfcnetwork.append(layers.ConvolutionLayer(W2, 14, 0, 1, 1, 1, 1, 0)) sfcnetwork.append(layers.ConvolutionLayer(W3, 12, 0, 1, 1, 1, 1, 0)) sfcnetwork.append(layers.ConvolutionLayer(W4, 5, 0, 1, 1, 1, 1, 0)) sfcnetwork.append(layers.ConvolutionLayer(W5, 3, 0, 1, 1, 1, 1, 0)) sfcnetwork.append(layers.FullyConnectedLayer(W6, 1, 1, 1)) sfcnetwork.append(layers.FullyConnectedLayer(W7, 1, 1, 1)) sfcnetwork.append(layers.FullyConnectedLayer(W8, 1, 1, 1)) net = FINN.core.nn.NN(layers=sfcnetwork) dev = device.Device('XLNX:VU9P.json', frequency=248.5) # Measured on AWS perf = perf_model.PerfModel(net, dev) fps = perf.maximise_fps() # From BNN spreadsheet, t3 perf.SIMD[0] = 3 perf.SIMD[1] = 64 perf.SIMD[2] = 64 perf.SIMD[3] = 64 perf.SIMD[4] = 64 perf.SIMD[5] = 64 perf.SIMD[6] = 16 perf.SIMD[7] = 16 perf.SIMD[8] = 16 perf.PE[0] = 64 perf.PE[1] = 64 perf.PE[2] = 64 perf.PE[3] = 64 perf.PE[4] = 64 perf.PE[5] = 64 perf.PE[6] = 16 perf.PE[7] = 16 perf.PE[8] = 4 perf.MMV[0] = 1 perf.MMV[1] = 1 perf.MMV[2] = 1 perf.MMV[3] = 1 perf.MMV[4] = 1 perf.MMV[5] = 1 perf.MMV[6] = 1 perf.MMV[7] = 1 perf.MMV[8] = 1 # FPS given the above folding factors fps = perf.fps() perf.nswg.calculate_neural_folding() perf.nswg.calculate_write_block_cycles() perf.nswg.calculate_read_block_cycles() perf.nswg.calculate_total_cycles() perf.nswg.calculate_input_multipliers() perf.print_folding_factors() 
perf.print_hardware_cost() perf.print_topology() perf.print_cycles() fps = perf.fps() print(perf.nswg) print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz" % ( fps, perf.network_utilisation()['luts'] / dev.luts * 100, perf.network_utilisation()['brams'] / dev.brams * 100, dev.frequency)
def demo_dorefa(): logging.basicConfig( filename='FINN.log', level=logging.INFO) # Changed WARNING to INFO if you want logging dorefanetwork = [] W0 = np.zeros((68, 3, 12, 12)) # out, in, kernel, kernel W1 = np.zeros((90, 34, 5, 5)) W2 = np.zeros((272, 180, 3, 3)) W3 = np.zeros((192, 136, 3, 3)) W4 = np.zeros((128, 192, 3, 3)) W5 = np.zeros((4096, 9216)) W6 = np.zeros((4096, 4096)) W7 = np.zeros((1000, 4096)) dorefanetwork.append(layers.ConvolutionLayer( W0, 227, 0, 4, 1, 1, 1, 0)) # in_dim, pad, stride, wbits, ibits, obits dorefanetwork.append(layers.ConvolutionLayer(W1, 58, 0, 1, 1, 1, 1, 0)) dorefanetwork[-1].parallel = 2 dorefanetwork.append(layers.ConvolutionLayer(W2, 29, 0, 1, 1, 1, 1, 0)) dorefanetwork.append(layers.ConvolutionLayer(W3, 16, 0, 1, 1, 1, 1, 0)) dorefanetwork[-1].parallel = 2 dorefanetwork.append(layers.ConvolutionLayer(W4, 16, 0, 1, 1, 1, 1, 0)) dorefanetwork[-1].parallel = 2 dorefanetwork.append(layers.FullyConnectedLayer(W5, 1, 1, 1)) dorefanetwork.append(layers.FullyConnectedLayer(W6, 1, 1, 1)) dorefanetwork.append(layers.FullyConnectedLayer(W7, 1, 1, 1)) net = FINN.core.nn.NN(layers=dorefanetwork) dev = device.Device('XLNX:VU9P.json', frequency=101) # Measured on AWS perf = perf_model.PerfModel(net, dev) # From BNN spreadsheet, t3 perf.SIMD[0] = 3 perf.SIMD[1] = 34 perf.SIMD[2] = 45 perf.SIMD[3] = 34 perf.SIMD[4] = 64 perf.SIMD[5] = 64 perf.SIMD[6] = 64 perf.SIMD[7] = 8 perf.PE[0] = 68 perf.PE[1] = 90 perf.PE[2] = 136 perf.PE[3] = 64 perf.PE[4] = 32 perf.PE[5] = 32 perf.PE[6] = 16 perf.PE[7] = 32 perf.MMV[0] = 18 perf.MMV[1] = 3 perf.MMV[2] = 3 perf.MMV[3] = 1 perf.MMV[4] = 1 perf.MMV[5] = 1 perf.MMV[6] = 1 perf.MMV[7] = 1 # FPS given the above folding factors fps = perf.fps() perf.nswg.calculate_neural_folding() perf.nswg.calculate_write_block_cycles() perf.nswg.calculate_read_block_cycles() perf.nswg.calculate_total_cycles() perf.nswg.calculate_input_multipliers() perf.print_folding_factors() perf.print_hardware_cost() 
perf.print_topology() perf.print_cycles() fps = perf.fps() print(perf.nswg) print "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz" % ( fps, perf.network_utilisation()['luts'] / dev.luts * 100, perf.network_utilisation()['brams'] / dev.brams * 100, dev.frequency)
# derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import FINN.core.perf_model as pm import FINN.core.device as device import FINN.core.nn as nn from FINN.frontend.caffeloader import CaffeLoader print "Hardware model" frequency = 200 #dev = device.Device('XLNX:PYNQ-Z1.json', frequency) dev = device.Device('XLNX:VU9P.json', frequency) net = nn.NN(CaffeLoader(None, './FINN/inputs/lfc-w1a1.prototxt')) perfmodel = pm.PerfModel(net, dev) fps = perfmodel.maximise_fps() print "Network Utilization" print perfmodel.network_utilisation()['luts']/dev.luts*100 print perfmodel.network_utilisation()['brams']/dev.brams*100