Example #1
0
def train(trained_model, trained_optimizer, max_steps=None):
    """Train the sketch-classification model with periodic evaluation.

    Args:
        trained_model: path to a model-variables checkpoint to restore,
            or a falsy value to start from scratch.
        trained_optimizer: path to an optimizer-variables checkpoint to
            restore, or a falsy value.
        max_steps: optional number of training steps. ``None`` keeps the
            original behavior of looping forever (in which case the final
            checkpoint save below is never reached). Passing an int makes
            the loop terminate so the checkpoints actually get written.
    """
    train_data_pattern = '../tutorial_dataset/training.tfrecord-?????-of-?????'
    class_file = '../tutorial_dataset/training.tfrecord.classes'
    eval_data_pattern = '../tutorial_dataset/eval.tfrecord-?????-of-?????'

    param = util_model.Param()
    param.class_num = data.get_num_classes(class_file)
    train_inks, train_lengths, train_labels = data.load_data(
        train_data_pattern, data.SessionMode.TRAIN, param.batch_size)
    # Evaluation batches are twice the training batch size.
    eval_inks, eval_lengths, eval_labels = data.load_data(
        eval_data_pattern, data.SessionMode.PREDICT, param.batch_size * 2)

    model = util_model.Model(param)

    # Separate savers so model weights and optimizer state can be
    # restored/saved independently.
    saver_model = tf.train.Saver(model.model_variables)
    saver_optimizer = tf.train.Saver(model.optimizer_variables)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter("./nn_logs", sess.graph)
        tf.summary.scalar('cost', model.cross_entropy)
        #merged = tf.summary.merge_all()

        # Initialize exactly once; restoring afterwards overwrites the
        # initialized values. (The original ran the initializer twice.)
        sess.run(tf.global_variables_initializer())
        if trained_model and trained_optimizer:
            saver_model.restore(sess, trained_model)
            saver_optimizer.restore(sess, trained_optimizer)

        idx = 0
        while max_steps is None or idx < max_steps:
            train_vinks, train_vlengths, train_vlabels = sess.run(
                [train_inks, train_lengths, train_labels])
            vloss, vacc, _ = sess.run(
                [model.cross_entropy, model.accuracy, model.train_op],
                feed_dict={
                    model.if_train: True,
                    model.input_inks: train_vinks,
                    model.input_lengths: train_vlengths,
                    model.input_labels: train_vlabels
                })
            print(vloss, vacc)
            # Evaluate every 10 steps on a fresh eval batch.
            if (idx + 1) % 10 == 0:
                eval_vinks, eval_vlengths, eval_vlabels = sess.run(
                    [eval_inks, eval_lengths, eval_labels])
                vacc = sess.run(model.accuracy,
                                feed_dict={
                                    model.if_train: False,
                                    model.input_inks: eval_vinks,
                                    model.input_lengths: eval_vlengths,
                                    model.input_labels: eval_vlabels
                                })
                print('val acc: ', vacc)

            idx = idx + 1
            #writer.add_summary(vsummary, i)

        # Reachable only when max_steps is given; with max_steps=None the
        # loop never exits (the original unconditionally never reached this).
        writer.close()
        saver_model.save(sess, 'mdl/model.ckpt')
        saver_optimizer.save(sess, 'mdl/optimizer.ckpt')
def main():
    """Rebuild a quantized model from a checkpoint, optionally validate it,
    run one inference, and emit an AutoTiler graph description file
    (``<network_name>.c``) plus binary weight/bias/scale dumps.

    All model configuration is read from the checkpoint named by
    ``--ckp_file``; exits with status 1 when the checkpoint is missing.
    """
    # parsing the input
    args = parser.parse_args()

    # read checkpoint file
    checkpoint_file = args.ckp_file
    if checkpoint_file == '':
        print('No checkpoint file provided!')
        exit(1)

    if os.path.isfile(checkpoint_file):
        checkpoint_loaded = torch.load(checkpoint_file)
        print('Model pretrained at ', checkpoint_file)
    else:
        print('Model pretrainined not Loaded')
        exit(1)

    # Load configuration from checkpoint
    model_name = checkpoint_loaded['model']
    model_config = checkpoint_loaded['config']
    activ_bits = model_config['activ_bits']
    activ_type = model_config['activ_type']
    dataset = model_config['dataset']
    input_dim = model_config['input_dim']
    input_size = input_dim
    num_classes = model_config['num_classes']
    type_quant = model_config['type_quant']
    weight_bits = model_config['weight_bits']
    width_mult = model_config['width_mult']
    additional_config = ''
    quant_add_config = checkpoint_loaded['add_config']
    fold_type = checkpoint_loaded['fold_type']

    # Load quantizer & model state
    quantizer_load = checkpoint_loaded['quantizer']
    model_state = checkpoint_loaded['state_dict']

    # Create model with same characteristics
    model = models.__dict__[model_name]
    nClasses = get_num_classes(dataset)
    model_config = {'input_size': input_size, 'dataset': dataset, 'num_classes': nClasses, \
                    'type_quant': type_quant, 'weight_bits': weight_bits, 'activ_bits': activ_bits,\
                    'activ_type': activ_type, 'width_mult': width_mult, 'input_dim': input_size }
    # Fixed: was "is not ''" — identity test against a literal; use equality.
    if additional_config != '':
        model_config = dict(model_config, **literal_eval(additional_config))
    model = models.__dict__[model_name]
    model = model(**model_config, pretrained=False)
    if model is None:
        print('ERROR: model is none')
        exit(1)

    # wrap the model with quantop operator
    dummy_input = torch.Tensor(1, 3, int(input_dim), int(input_dim))
    quantizer = quantization.QuantOp(model, quant_type = type_quant, weight_bits = weight_bits,bias_bits =32, \
                                    batch_fold_delay=0,batch_fold_type=fold_type, act_bits=activ_bits, \
                                    add_config=quant_add_config, dummy_input = dummy_input  )

    # load features from quantizer_load into quantizer and model state
    for i, item in enumerate(quantizer_load.param_to_quantize):
        item2 = quantizer.param_to_quantize[i]
        item2['w_max_thr'] = item['w_max_thr']
        item2['bias_bits'] = item['bias_bits']
        if 'w_min_thr' in item2.keys():
            item2['w_min_thr'] = item['w_min_thr']
        if item2['conv'] is not None:
            item2['conv'].load_state_dict(item['conv'].state_dict())
        if item2['batch_norm'] is not None:
            item2['batch_norm'].load_state_dict(
                item['batch_norm'].state_dict())
        if item2['act'] is not None:
            item2['act'].load_state_dict(item['act'].state_dict())

    # enable folding of batch norm in to convolutional layers
    quantizer.batch_fold = True

    # generate model (with cuda)
    quantizer.generate_deployment_model()
    quantizer.deployment_model.cuda()
    quantizer.deployment_model.eval()

    # adjust bias and scaling factor
    for i, item in enumerate(quantizer.param_to_quantize):
        # clamping bias of last layer
        if type(item['quant_conv']) is nn.Linear:
            bias_bits = item['bias_bits']
            if item['quant_conv'].bias is not None:
                item['quant_conv'].bias.data.clamp_(-2**(bias_bits - 1),
                                                    2**(bias_bits - 1) - 1)

        # clamping bias of other layers
        else:
            #mult bias rounding: truncate M0 to MULTBIAS_BITS-1 fractional bits
            if item['quant_act'] is not None:
                M0 = item['quant_act'].M_ZERO
                N0 = item['quant_act'].N_ZERO
                M0_new = np.floor(
                    M0 * 2**(MULTBIAS_BITS - 1)) / 2**(MULTBIAS_BITS - 1)
                item['quant_act'].M_ZERO = M0_new
                item['quant_act'].N_ZERO = N0

            #bias clamping and extraction of normbias (rescale into int8 range)
            if item['quant_conv'].bias is not None:
                bias_bits = item['bias_bits']
                bias = item['quant_conv'].bias.data
                bias = bias.clamp_(-2**(bias_bits - 1), 2**(bias_bits - 1) - 1)
                N0_bias = -max(bias.abs().max().log2().ceil().item() - 7, 0)
                item['quant_conv'].bias.data = bias.mul(
                    2**N0_bias).round().clamp(-2**7,
                                              (2**7) - 1).div(2**N0_bias)
                item['quant_conv'].bias.N0_bias = N0_bias

    # run validation
    if args.evaluate:
        import torchvision.transforms as transforms

        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])

        input_eval_transform = transforms.Compose([
            # Fixed: transforms.Scale was removed from torchvision;
            # Resize is its documented drop-in replacement.
            transforms.Resize(int(input_size)),
            transforms.ToTensor(), normalize
        ])
        # Fixed: the original did input_transform['eval'] unconditionally,
        # which raises TypeError when the getattr default (a Compose, not
        # subscriptable) is used because the model has no input_transform.
        input_transform = getattr(model, 'input_transform', None)
        if input_transform is not None:
            eval_transform = input_transform['eval']
        else:
            eval_transform = input_eval_transform

        val_data = get_dataset(dataset, 'val', eval_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=32,
                                                 shuffle=False,
                                                 num_workers=8,
                                                 pin_memory=True)

        def forward(data_loader,
                    model,
                    epoch=0,
                    training=False,
                    quantizer=None):
            """Run one evaluation pass; returns (top1_avg, top5_avg)."""
            batch_time = AverageMeter()
            data_time = AverageMeter()
            losses = AverageMeter()
            top1 = AverageMeter()
            top5 = AverageMeter()

            end = time.time()
            model.eval()

            # Input quantization grid: 8-bit over [-1, 1].
            max_i = 1
            min_i = -1
            bit_i = 8
            n_steps = (2**bit_i) - 1
            eps = (max_i - min_i) / n_steps

            for i, (inputs, target) in enumerate(data_loader):
                # measure data loading time
                data_time.update(time.time() - end)
                # Fixed: .cuda(async=True) is a SyntaxError on Python 3.7+
                # ('async' is a reserved keyword); non_blocking is the
                # documented replacement.
                target = target.cuda(non_blocking=True)
                # NOTE(review): Variable(volatile=...) is deprecated in
                # modern PyTorch (use torch.no_grad()); kept for behavior.
                input_var = Variable(inputs.cuda(), volatile=not training)
                target_var = Variable(target)
                input_var = input_var.clamp(min_i, max_i).div(eps).round()
                if quantizer is not None:
                    input_var = input_var.mul(eps)
                    quantizer.store_and_quantize(training=False)

                # compute output
                output = model(input_var)
                if type(output) is list:
                    output = output[0]
                values, indices = output.max(1)
                # measure accuracy and record loss
                prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
                top1.update(prec1.item(), inputs.size(0))
                top5.update(prec5.item(), inputs.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if quantizer is not None:
                    quantizer.restore_real_value()

                if i % 100 == 0:
                    print('{phase} - Epoch: [{0}][{1}/{2}]\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              epoch,
                              i,
                              len(data_loader),
                              phase='TRAINING' if training else 'EVALUATING',
                              batch_time=batch_time,
                              data_time=data_time,
                              top1=top1,
                              top5=top5))

            print('Top1: ', top1.avg)
            return top1.avg, top5.avg

        forward(val_loader, quantizer.deployment_model.cuda())

    # dumping of the input image
    input_dim = int(input_dim)
    if args.img_file == '':
        print('No input image provide. Going to generate a rondom matrix!')
        x = torch.Tensor(1, 3, int(input_dim),
                         int(input_dim)).cuda().random_(-4, 4)

    else:
        # Center-crop an input_dim x input_dim patch out of a 256x256 resize.
        image = cv2.imread(args.img_file)
        scale_dim = 256
        off = int((scale_dim - input_dim) / 2)
        image = cv2.resize(image, (scale_dim, scale_dim))
        image = image[off:input_dim + off, off:off + input_dim, :]
        x = torch.Tensor(image).permute(2, 0, 1).unsqueeze(0).cuda().add(-128)

    y = quantizer.deployment_model(x)
    print('This is the inference result: ', y.argmax().item())

    #file generation
    if args.bin_path == '':
        root_folder = './'
    else:
        root_folder = args.bin_path
    folder_bindump = root_folder + 'binFiles/'

    if args.dump:
        os.makedirs(root_folder, exist_ok=True)
        os.makedirs(folder_bindump, exist_ok=True)

    def get_4D_act_tensor_to_list_HWC(x):
        """Flatten a 1xCxHxW activation tensor to a list of ints (C,H,W order)."""
        tensor_list = []
        n_b, in_ch, in_dim_w, in_dim_h = x.size()
        for c in range(in_ch):
            for i in range(in_dim_w):
                for j in range(in_dim_h):
                    tensor_list.append(int(x[0][c][i][j].item()))
        return tensor_list

    def get_4D_wgt_tensor_to_list_HWC(w):
        """Flatten an OxIxKhxKw conv weight tensor to a list of ints."""
        tensor_list = []
        out_ch, in_ch, ker_w, ker_h = w.size()
        for co in range(out_ch):
            for ci in range(in_ch):
                for i in range(ker_w):
                    for j in range(ker_h):
                        tensor_list.append(int(w[co][ci][i][j].item()))
        return tensor_list

    def get_2D_wgt_tensor_to_list_HWC(w):
        """Flatten an OxI linear weight tensor to a list of ints."""
        tensor_list = []
        out_ch, in_ch = w.size()
        for co in range(out_ch):
            for ci in range(in_ch):
                tensor_list.append(int(w[co][ci].item()))
        return tensor_list

    def get_1D_bias_tensor_to_list_HWC(b):
        """Flatten a 1-D bias tensor to a list of ints."""
        tensor_list = []
        out_ch, = b.size()
        for co in range(out_ch):
            tensor_list.append(int(b[co].item()))
        return tensor_list

    def dump_int16_tensor_to_file(file_name, data):
        """Write each item as a little 16-bit signed value ('h')."""
        newFile = open(file_name, "wb")
        for item in data:
            newFile.write(struct.pack('h', item))  # 2 bytes per item
        newFile.close()

    def dump_int8_tensor_to_file(file_name, data):
        """Write each item as an 8-bit signed value ('b')."""
        newFile = open(file_name, "wb")
        for item in data:
            newFile.write(struct.pack('b', item))  # 1byte
        newFile.close()

    def print_size(model, input, output):
        # Forward hook: capture the hooked layer's input/output sizes
        # into module-level globals read right after the probe forward pass.
        global si, so
        si = input[0].size()
        so = output[0].size()

    # graph buffers
    Layers = []
    WeightEdges = []
    ActivEdges = []
    GraphNodes = []

    # dump input data
    input_tensor = "In"  # this must match the input tensor of the network inference function
    if args.dump:
        data = get_4D_act_tensor_to_list_HWC(x)
        dump_int8_tensor_to_file(folder_bindump + 'L0_INPUT.bin', data)
    txt = 'TCArgInfo("signed char *__restrict", "{}", ARG_SCOPE_ARG, ARG_DIR_IN, 1,AT_MEM_L3_HRAM, 0)'.format(
        input_tensor)
    WeightEdges.append(txt)

    # dump the network layer-by-layer
    i_l = 0
    for i, item in enumerate(quantizer.param_to_quantize):

        #convolution parameters: probe this layer's input size via a hook
        conv = item['quant_conv']
        hook = conv.register_forward_hook(print_size)
        quantizer.deployment_model(x)
        hook.remove()
        input_size = si

        if item['quant_act'] is not None:
            M0 = int(item['quant_act'].M_ZERO * 2**(MULTBIAS_BITS - 1))
            N0 = MULTBIAS_BITS - 1 - item['quant_act'].N_ZERO

        if type(conv) in [
                models.linear_quantized_modules.Conv2d_SAME, nn.Conv2d
        ]:
            out_ch = conv.out_channels
            in_ch = conv.in_channels
            # groups == in_channels means depthwise convolution
            if in_ch == conv.groups:
                is_dw = True
            else:
                is_dw = False
            ker_size = conv.kernel_size
            ker_stride = conv.stride
            ker_dilation = conv.dilation
            if is_dw:
                num_params = out_ch * ker_size[0] * ker_size[0]
            else:
                num_params = out_ch * in_ch * ker_size[0] * ker_size[0]
            ker_padding = conv.padding

            #weight parameters
            file_txt = 'L{}_weight_L3.bin'.format(i_l)
            if args.dump:
                data = get_4D_wgt_tensor_to_list_HWC(conv.weight.data)
                dump_int8_tensor_to_file(folder_bindump + file_txt, data)
            txt = 'TCArgInfo ("signed char *__restrict", "FL{}",  ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN, 0,  AT_MEM_UNDEF, ConstInfo("{}", 1, 1, 1, 0))'.format( \
                i_l,folder_bindump+file_txt)
            WeightEdges.append(txt)

            #bias parameters
            N0_bias = int(-conv.bias.N0_bias)
            bias_int = conv.bias.div(2**N0_bias).round().clamp(
                -2**7, (2**7) - 1)
            file_txt = 'L{}_bias_L3.bin'.format(i_l)
            if args.dump:
                data = get_1D_bias_tensor_to_list_HWC(bias_int)
                dump_int8_tensor_to_file(folder_bindump + file_txt, data)
            txt = 'TCArgInfo ( "signed char * __restrict__", "BL{}",  ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN, 0,  AT_MEM_UNDEF, ConstInfo(  "{}", 1, 1, 1, 0))'\
                .format(i_l,folder_bindump+file_txt)
            WeightEdges.append(txt)

            # scaling factor parameters
            file_txt = 'L{}_M0_L3.bin'.format(i_l)
            if args.dump:
                data = [M0 for x in range(out_ch)]
                dump_int8_tensor_to_file(folder_bindump + file_txt, data)
            txt = 'TCArgInfo ("signed char *__restrict", "ML{}",  ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN, 0,  AT_MEM_UNDEF, ConstInfo("{}", 1, 1, 1, 0))'.format( \
                i_l,folder_bindump+file_txt)
            WeightEdges.append(txt)

            # add node to the graph
            NormMul = 7  # fixed
            Norm = N0 - NormMul
            NormBias = N0_bias
            #if ker_size[0]==1 and ker_size[1]==1:
            #
            #else:
            #	NormBias = 2*Norm - N0_bias

            # convolution layer
            txt = 'CNN_ConvolutionMulBiasPoolReLU("Layer{}", &CtrlH, 1,1,1,1,1,{},{},{},{},{},1,1,1,0,1,{},{},{},{},{},{},{},{},{},{},{}, 1, KOP_NONE, 3,3, 1,1, 2,2, 1, KOP_RELU)'\
                .format(i_l,0,0,-NormBias,N0,0,\
                  in_ch,out_ch,input_size[2],input_size[3],'KOP_CONV_DWDP' if is_dw else 'KOP_CONV_DP', \
                       ker_size[0], ker_size[1], ker_dilation[0], ker_dilation[1], ker_stride[0], ker_stride[1] )
            Layers.append(txt)

            # temporary tensors
            output_tensor = "OutL{}".format(i)
            txt = 'TCArgInfo ("signed char *__restrict", "{}", ARG_SCOPE_LOCAL,  ARG_DIR_INOUT, 0, AT_MEM_UNDEF, 0)'\
                .format(output_tensor)
            ActivEdges.append(txt)


            txt = 'AddNode("Layer{}",Bindings(5,GNodeArg(GNA_IN, "{}", 0),GNodeArg(GNA_IN, "FL{}", 0),GNodeArg(GNA_IN, "BL{}", 0),GNodeArg(GNA_IN, "ML{}", 0),GNodeArg(GNA_OUT, "{}", 0) ))'\
                .format(i_l,input_tensor,i_l,i_l,i_l,output_tensor)
            GraphNodes.append(txt)

            # this layer's output feeds the next layer
            input_tensor = output_tensor
            i_l += 1

        elif type(conv) in [nn.Linear]:

            out_features = conv.out_features
            in_features = conv.in_features

            file_txt = 'L{}_weight_L3.bin'.format(i_l)
            data = get_2D_wgt_tensor_to_list_HWC(conv.weight.data)
            if args.dump:
                dump_int8_tensor_to_file(folder_bindump + file_txt, data)
            txt = 'TCArgInfo ("signed char *__restrict", "FL{}",  ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN, 0,  AT_MEM_UNDEF, ConstInfo("{}", 1, 1, 1, 0))'.format( \
                i_l,folder_bindump+file_txt)
            WeightEdges.append(txt)

            # Fixed: the original reused the *weight* file name here, so the
            # int16 bias dump silently overwrote L{}_weight_L3.bin.
            file_txt = 'L{}_bias_L3.bin'.format(i_l)
            data = get_1D_bias_tensor_to_list_HWC(conv.bias.data)
            if args.dump:
                dump_int16_tensor_to_file(folder_bindump + file_txt, data)
            txt = 'TCArgInfo ( "short int * __restrict__", "BL{}",  ARG_SCOPE_GLOBAL, ARG_DIR_CONSTIN, 0,  AT_MEM_UNDEF, ConstInfo(  "{}", 1, 1, 1, 0))'\
                .format(i_l,folder_bindump+file_txt)
            WeightEdges.append(txt)

            txt = 'CNN_LinearReLU("Layer{}",&CtrlH, 1,1,2,2,0,0,0,0,1,1,1,1,{},{},KOP_LINEAR, KOP_NONE)'\
                .format(i_l,in_features,out_features )
            Layers.append(txt)

            output_tensor = "Out"
            txt = 'TCArgInfoA("short int *__restrict", "{}",     ARG_SCOPE_ARG,   ARG_DIR_OUT,    AT_MEM_L2,  AT_MEM_L2, 0)'\
                .format(output_tensor)
            WeightEdges.append(txt)

            txt = 'AddNode("Layer{}",Bindings(4,GNodeArg(GNA_IN, "{}", 0),GNodeArg(GNA_IN, "FL{}", 0),GNodeArg(GNA_IN, "BL{}", 0),GNodeArg(GNA_OUT, "{}", 0)))'\
                .format(i_l,input_tensor,i_l,i_l,output_tensor)
            GraphNodes.append(txt)

            input_tensor = output_tensor
            i_l += 1

        # pooling layer append to the last convolution layer
        if item['pool'] is not None:
            pool = item['pool'][0]
            if type(pool) is nn.AvgPool2d:
                txt = 'CNN_PoolReLU("Layer{}",&CtrlH, 1,1,0,0, 1,1, {},{}, {},{},KOP_AVGPOOL,{},{}, 1,1,{},{}, 1, KOP_NONE)'\
                    .format(i_l,input_size[1],input_size[1],input_size[2],input_size[3],pool.kernel_size, pool.kernel_size,\
                           pool.stride, pool.stride)
                Layers.append(txt)

                output_tensor = "OutL{}".format(i_l)
                txt = 'TCArgInfo ("signed char *__restrict", "{}", ARG_SCOPE_LOCAL,  ARG_DIR_INOUT, 0, AT_MEM_UNDEF, 0)'\
                    .format(output_tensor)
                ActivEdges.append(txt)

                txt = 'AddNode("Layer{}",Bindings(2,GNodeArg(GNA_IN, "{}", 0),GNodeArg(GNA_OUT, "{}", 0)))'\
                    .format(i_l,input_tensor,output_tensor)
                GraphNodes.append(txt)

                input_tensor = output_tensor
                i_l += 1

    # print Graph Model
    f_txt = ''
    f_txt += 'void {}()'.format(args.network_name) + '{\n'
    f_txt += '\tCNN_GenControl_T CtrlH;\n'
    f_txt += '\tCNN_InitGenCtrl(&CtrlH);\n'
    f_txt += '\tCNN_SetGenCtrl(&CtrlH, "EnableIm2Col", AT_OPT_ON);\n'
    f_txt += '\tCNN_SetGenCtrl(&CtrlH, "PADTYPE", PAD_BALANCED_RIGHT);\n\n'
    for item in Layers:
        f_txt += '\t' + item + ';\n'

    f_txt += '\n\tCreateGraph("{}",\n'.format(args.network_name)
    f_txt += '\t\tCArgs({},\n'.format(len(WeightEdges))
    for i, item in enumerate(WeightEdges):
        f_txt += '\t\t\t' + item
        f_txt += '\n' if i == len(WeightEdges) - 1 else ',\n'

    f_txt += '\t\t),\n\t\tCArgs({},\n'.format(len(ActivEdges))
    for i, item in enumerate(ActivEdges):
        f_txt += '\t\t\t' + item
        f_txt += '\n' if i == len(ActivEdges) - 1 else ',\n'
    f_txt += '\t\t)\n\t);\n'

    for i, item in enumerate(GraphNodes):
        f_txt += '\t' + item + ';\n'

    f_txt += '\tCloseGraph();\n}'

    #if args.dump:
    file_name = args.network_name + '.c'
    newFile = open(file_name, "w")
    newFile.write(f_txt)
    newFile.close()
Example #3
0
def main():
    """Train (or evaluate) a quantization-aware model.

    Parses CLI args, builds the model/dataloaders/optimizer, optionally wraps
    the model in a QuantOp quantizer, then either evaluates once or runs the
    full training loop, logging results and checkpointing the best model.
    """
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()

    weight_bits = int(args.weight_bits)
    activ_bits = int(args.activ_bits)

    # Fixed: was "args.save is ''" — identity comparison against a string
    # literal; use equality (same fix applied to the other literal checks).
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.type:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        print('Selected GPUs: ', args.gpus)
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
    else:
        args.gpus = None

    # create model
    logging.info("creating model %s", args.model)
    model = models.__dict__[args.model]
    nClasses = get_num_classes(args.dataset)
    model_config = {'input_size': args.input_size, 'dataset': args.dataset, 'num_classes': nClasses, \
                    'type_quant': args.type_quant, 'weight_bits': weight_bits, 'activ_bits': activ_bits,\
                    'activ_type': args.activ_type, 'width_mult': float(args.mobilenet_width), 'input_dim': float(args.mobilenet_input) }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    print(model)

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train':
        get_transform(args.dataset, input_size=args.input_size, augment=True),
        'eval':
        get_transform(args.dataset, input_size=args.input_size, augment=False)
    }
    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(
        model, 'regime', {
            0: {
                'optimizer': args.optimizer,
                'lr': args.lr,
                'momentum': args.momentum,
                'weight_decay': args.weight_decay
            }
        })
    print(transform)
    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.type(args.type)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.quantizer:
        # smaller fixed batch size for evaluating the deployment model
        val_quant_loader = torch.utils.data.DataLoader(
            val_data,
            batch_size=32,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=True)

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    #define optimizer: clipping thresholds get their own weight decay
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        if 'clip_val' in key:
            params += [{'params': value, 'weight_decay': 1e-4}]
        else:
            params += [{'params': value}]
    optimizer = torch.optim.SGD(params, lr=0.1)
    logging.info('training regime: %s', regime)

    # Fixed: add_config was only bound inside the quantizer branch but is
    # referenced unconditionally when checkpointing below (NameError when
    # running with the quantizer disabled).
    add_config = []

    #define quantizer
    if args.quantizer:
        if args.mem_constraint != '':
            mem_contraints = json.loads(args.mem_constraint)
            print('This is the memory constraint:', mem_contraints)
            if mem_contraints is not None:
                x_test = torch.Tensor(1, 3, args.mobilenet_input,
                                      args.mobilenet_input)
                add_config = memory_driven_quant(model, x_test,
                                                 mem_contraints[0],
                                                 mem_contraints[1],
                                                 args.mixed_prec_quant)
                if add_config == -1:
                    print('The quantization process failed!')
            else:
                add_config = []
        else:
            mem_constraint = None
            if args.quant_add_config != '':
                add_config = json.loads(args.quant_add_config)

            else:
                add_config = []

        quantizer = quantization.QuantOp(model, args.type_quant, weight_bits, \
            batch_fold_type=args.batch_fold_type, batch_fold_delay=args.batch_fold_delay, act_bits=activ_bits, \
            add_config = add_config )
        quantizer.deployment_model.type(args.type)
        quantizer.add_params_to_optimizer(optimizer)

    else:
        quantizer = None

    #exit(0)

    #multi gpus
    if args.gpus and len(args.gpus) > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model.type(args.type)

    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint_loaded = torch.load(checkpoint_file)
            checkpoint = checkpoint_loaded['state_dict']
            model.load_state_dict(checkpoint, strict=False)
            print('Model pretrained')
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    if args.quantizer:
        quantizer.init_parameters()

    if args.evaluate:
        # evaluate on validation set

        if args.quantizer:
            # evaluate deployment model on validation set
            quantizer.generate_deployment_model()
            val_quant_loss, val_quant_prec1, val_quant_prec5 = validate(
                val_quant_loader, quantizer.deployment_model, criterion, 0,
                'deployment')
        else:
            val_quant_loss, val_quant_prec1, val_quant_prec5 = 0, 0, 0

        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  0, quantizer)

        logging.info('\n This is the results from evaluation only: '
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \t'
                     'Validation Quant Prec@1 {val_quant_prec1:.3f} \t'
                     'Validation Quant Prec@5 {val_quant_prec5:.3f} \n'.format(
                         val_prec1=val_prec1,
                         val_prec5=val_prec5,
                         val_quant_prec1=val_quant_prec1,
                         val_quant_prec5=val_quant_prec5))
        exit(0)

    for epoch in range(args.start_epoch, args.epochs):
        optimizer = adjust_optimizer(optimizer, epoch, regime)

        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(train_loader, model,
                                                     criterion, epoch,
                                                     optimizer, quantizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  epoch, quantizer)

        if args.quantizer:
            # evaluate deployment model on validation set
            quantizer.generate_deployment_model()
            val_quant_loss, val_quant_prec1, val_quant_prec5 = validate(
                val_quant_loader, quantizer.deployment_model, criterion, epoch,
                'deployment')
        else:
            val_quant_loss, val_quant_prec1, val_quant_prec5 = 0, 0, 0

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        #save_model
        if args.save_check:

            print('Saving Model!! Accuracy : ', best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': args.model,
                    'config': model_config,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'regime': regime,
                    'quantizer': quantizer,
                    'add_config': add_config,
                    'fold_type': args.batch_fold_type
                },
                is_best,
                path=save_path)

        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \t'
                     'Validation Quant Prec@1 {val_quant_prec1:.3f} \t'
                     'Validation Quant Prec@5 {val_quant_prec5:.3f} \n'.format(
                         epoch + 1,
                         train_loss=train_loss,
                         val_loss=val_loss,
                         train_prec1=train_prec1,
                         val_prec1=val_prec1,
                         train_prec5=train_prec5,
                         val_prec5=val_prec5,
                         val_quant_prec1=val_quant_prec1,
                         val_quant_prec5=val_quant_prec5))

        results.add(epoch=epoch + 1,
                    train_loss=train_loss,
                    val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5,
                    val_quant_error1=100 - val_quant_prec1,
                    val_quant_error5=100 - val_quant_prec5)
        results.save()
def main():
    """Entry point: build a ResNet-18/MobileNet-family model, optionally
    convert it to a NEMO FakeQuantized (PACT) representation with a
    hand-tuned mixed-precision layout, then train/validate it epoch by
    epoch, checkpointing on best top-1 accuracy.

    All configuration comes from the module-level argument ``parser``;
    results go to ``args.results_dir/args.save`` (CSV + HTML) and to
    TensorBoard via ``SummaryWriter``.
    """
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()

    weight_bits = int(args.weight_bits)
    activ_bits = int(args.activ_bits)

    # FIX: 'is'/'is not' test object identity, not equality, and comparing
    # against a literal is a SyntaxWarning on CPython >= 3.8. String
    # comparisons below use '=='/'!='.
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    writer = SummaryWriter()

    if 'cuda' in args.type:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        print('Selected GPUs: ', args.gpus)
        # torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
    else:
        args.gpus = None

    # FIX: model_config must be built *before* model creation -- the
    # original referenced it in the 'mobilenet' branch below while only
    # defining it afterwards, which raised NameError.
    nClasses = get_num_classes(args.dataset)
    model_config = {'input_size': args.input_size, 'dataset': args.dataset, 'num_classes': nClasses,
                    'width_mult': float(args.mobilenet_width), 'input_dim': float(args.mobilenet_input)}
    if args.model_config != '':
        # caller-supplied overrides win over the defaults above
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create model
    logging.info("creating model %s", args.model)
    if args.model == 'mobilenet':
        model = models.__dict__[args.model](**model_config)
    elif args.model == 'mobilenetv2':
        model = torch.hub.load('pytorch/vision:v0.6.0',
                               'mobilenet_v2',
                               pretrained=True)
    elif args.model == 'resnet18':
        model = torch.hub.load('pytorch/vision:v0.6.0',
                               'resnet18',
                               pretrained=True)
    else:  # fallback: mobilenet_v3 with locally stored pretrained weights
        model = models.mobilenetv3_large(
            width_mult=float(args.mobilenet_width))
        model.load_state_dict(
            torch.load(
                "models/mobilenet_v3/mobilenetv3-large-0.75-9632d2a8.pth"))

    logging.info("created model with configuration: %s", model_config)
    print(model)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train':
        get_transform(args.dataset, input_size=args.input_size, augment=True),
        'eval':
        get_transform(args.dataset, input_size=args.input_size, augment=False)
    }
    transform = getattr(model, 'input_transform', default_transform)
    # NOTE(review): `regime` mirrors models that embed a training schedule,
    # but the loop below uses a fixed Adam optimizer; kept for parity.
    regime = getattr(
        model, 'regime', {
            0: {
                'optimizer': args.optimizer,
                'lr': args.lr,
                'momentum': args.momentum,
                'weight_decay': args.weight_decay
            }
        })
    print(transform)
    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.type(args.type)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # small random subset of the validation set for quick checks
    fast_val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
        sampler=torch.utils.data.RandomSampler(val_data,
                                               replacement=True,
                                               num_samples=1000))

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    # FIX: the sampler must draw indices from the dataset this loader wraps;
    # the original sampled val_data while the loader served train_data.
    fast_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
        sampler=torch.utils.data.RandomSampler(train_data,
                                               replacement=True,
                                               num_samples=100000))

    # (removed: an unused per-parameter weight-decay list that was never
    # handed to any optimizer -- the real optimizer is built further down
    # from get_clip_parameters()/get_nonclip_parameters())

    mixed_prec_dict = None
    if args.mixed_prec_dict is not None:
        mixed_prec_dict = nemo.utils.precision_dict_from_json(
            args.mixed_prec_dict)
        print("Load mixed precision dict from outside")
    elif args.mem_constraint != '':
        mem_contraints = json.loads(args.mem_constraint)
        print('This is the memory constraint:', mem_contraints)
        if mem_contraints is not None:
            x_test = torch.Tensor(1, 3, 224, 224)
            mixed_prec_dict = memory_driven_quant(model,
                                                  x_test,
                                                  mem_contraints[0],
                                                  mem_contraints[1],
                                                  args.mixed_prec_quant,
                                                  use_sawb=args.use_sawb)

    # multi gpus
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model = model.cuda()

    if args.resume is None:
        # baseline accuracy of the full-precision network
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  0, None)
        print("[NEMO] Full-precision model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

    if args.quantize:

        # transform the model in a NEMO FakeQuantized representation
        model = nemo.transform.quantize_pact(model,
                                             dummy_input=torch.randn(
                                                 (1, 3, 224, 224)).to('cuda'))

        if args.resume is not None:
            checkpoint_file = args.resume
            if os.path.isfile(checkpoint_file):
                logging.info("loading checkpoint '%s'", args.resume)
                checkpoint_loaded = torch.load(checkpoint_file)
                checkpoint = checkpoint_loaded['state_dict']
                model.load_state_dict(checkpoint, strict=True)
                prec_dict = checkpoint_loaded.get('precision')
            else:
                logging.error("no checkpoint found at '%s'", args.resume)
                import sys
                sys.exit(1)

        if args.resume is None:
            # Calibrate at a very high precision first, then step down.
            print("[NEMO] Model calibration")
            model.change_precision(bits=20)
            model.reset_alpha_weights()

            if args.initial_folding:
                model.fold_bn()
                # use DFQ for weight equalization
                if args.initial_equalization:
                    model.equalize_weights_dfq()
            elif args.initial_equalization:
                model.equalize_weights_lsq(verbose=True)
                model.reset_alpha_weights()
                # model.reset_alpha_weights(use_method='dyn_range', dyn_range_cutoff=0.05, verbose=True)

            # calibrate activation ranges after equalization
            with model.statistics_act():
                val_loss, val_prec1, val_prec5 = validate(
                    val_loader, model, criterion, 0, None)
            model.reset_alpha_act()

            val_loss, val_prec1, val_prec5 = validate(val_loader, model,
                                                      criterion, 0, None)

            print("[NEMO] 20-bit calibrated model: top-1=%.2f top-5=%.2f" %
                  (val_prec1, val_prec5))
            nemo.utils.save_checkpoint(model,
                                       None,
                                       0,
                                       acc=val_prec1,
                                       checkpoint_name='resnet18_calibrated',
                                       checkpoint_suffix=args.suffix)

            model.change_precision(bits=activ_bits)
            model.change_precision(bits=weight_bits, scale_activations=False)

            # init weight clipping parameters to their reset value and disable their gradient
            model.reset_alpha_weights()
            if args.use_sawb:
                model.disable_grad_sawb()
                model.weight_clip_sawb()

            # Hand-tuned mixed-precision layout for ResNet-18: later layers
            # tolerate fewer weight bits; first relu is pinned to 2-bit
            # activations. TODO(review): consider moving to a config file.
            mixed_prec_dict_all = model.export_precision()
            mixed_prec_dict_all['relu']['x_bits'] = 2
            mixed_prec_dict_all['layer1.0.relu']['x_bits'] = 4
            mixed_prec_dict_all['layer3.1.conv1']['W_bits'] = 4
            mixed_prec_dict_all['layer3.1.conv2']['W_bits'] = 4
            mixed_prec_dict_all['layer4.0.conv1']['W_bits'] = 2
            mixed_prec_dict_all['layer4.0.conv2']['W_bits'] = 2
            mixed_prec_dict_all['layer4.1.conv1']['W_bits'] = 2
            mixed_prec_dict_all['layer4.1.conv2']['W_bits'] = 2
            model.change_precision(bits=1, min_prec_dict=mixed_prec_dict_all)

        else:
            print("[NEMO] Not calibrating model, as it is pretrained")
            model.change_precision(bits=1, min_prec_dict=prec_dict)

    # clipping (alpha/beta) parameters get a stronger weight decay than the
    # ordinary weights
    optimizer = torch.optim.Adam([
        {
            'params': model.get_nonclip_parameters(),
            'lr': args.lr,
            'weight_decay': 1e-5
        },
        {
            'params': model.get_clip_parameters(),
            'lr': args.lr,
            'weight_decay': 0.001
        },
    ])

    reset_grad_flow(model, __global_ave_grads, __global_max_grads)
    for epoch in range(args.start_epoch, args.epochs):
        # optimizer = adjust_optimizer(optimizer, epoch, regime)

        # train for one epoch; BN is absorbed on the first epoch and frozen
        # afterwards
        train_loss, train_prec1, train_prec5 = train(
            train_loader,
            model,
            criterion,
            epoch,
            optimizer,
            freeze_bn=True if epoch > 0 else False,
            absorb_bn=True if epoch == 0 else False,
            writer=writer)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  epoch)

        writer.add_scalar('Loss/val', val_loss, epoch * len(train_loader))
        writer.add_scalar('Accuracy/val', val_prec1, epoch * len(train_loader))

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        # save_model
        if args.save_check:
            nemo.utils.save_checkpoint(
                model,
                optimizer,
                0,
                acc=val_prec1,
                checkpoint_name='resnet18%s_checkpoint' %
                ("_mixed" if mixed_prec_dict is not None else ""),
                checkpoint_suffix=args.suffix)

        if is_best:
            nemo.utils.save_checkpoint(
                model,
                optimizer,
                0,
                acc=val_prec1,
                checkpoint_name='resnet18%s_best' %
                ("_mixed" if mixed_prec_dict is not None else ""),
                checkpoint_suffix=args.suffix)

        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \t'.format(
                         epoch + 1,
                         train_loss=train_loss,
                         val_loss=val_loss,
                         train_prec1=train_prec1,
                         val_prec1=val_prec1,
                         train_prec5=train_prec5,
                         val_prec5=val_prec5))

        results.add(epoch=epoch + 1,
                    train_loss=train_loss,
                    val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5)
        results.save()
def main():
    """Entry point: build a MobileNet-family model, optionally convert it to
    a NEMO FakeQuantized (PACT) representation, and either export it
    (``--pure_export`` / ``--terminal`` dump golden activation vectors and
    ONNX, then exit) or run the train/validate loop, checkpointing on best
    top-1 accuracy.

    All configuration comes from the module-level argument ``parser``;
    results go to ``args.results_dir/args.save`` (CSV + HTML).
    """
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()

    weight_bits = int(args.weight_bits)
    activ_bits = int(args.activ_bits)

    # FIX: 'is'/'is not' test object identity, not equality (SyntaxWarning
    # with literals on CPython >= 3.8); string comparisons use '=='/'!='.
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.type:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        print('Selected GPUs: ', args.gpus)
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
    else:
        args.gpus = None

    # FIX: build model_config before model creation and instantiate only in
    # the branch where `model` is still a class. The torch.hub and
    # mobilenetv3 branches already return *instances*, so the original
    # unconditional `model = model(**model_config)` would have invoked
    # forward(**model_config) on them (TypeError).
    nClasses = get_num_classes(args.dataset)
    model_config = {'input_size': args.input_size, 'dataset': args.dataset, 'num_classes': nClasses,
                    'width_mult': float(args.mobilenet_width), 'input_dim': float(args.mobilenet_input)}
    if args.model_config != '':
        # caller-supplied overrides win over the defaults above
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create model
    logging.info("creating model %s", args.model)
    if args.model == 'mobilenet':
        model = models.__dict__[args.model](**model_config)
    elif args.model == 'mobilenetv2':
        model = torch.hub.load('pytorch/vision:v0.6.0',
                               'mobilenet_v2',
                               pretrained=True)
    else:  # fallback: mobilenet_v3 with locally stored pretrained weights
        model = models.mobilenetv3_large(
            width_mult=float(args.mobilenet_width))
        model.load_state_dict(
            torch.load(
                "models/mobilenet_v3/mobilenetv3-large-0.75-9632d2a8.pth"))

    logging.info("created model with configuration: %s", model_config)
    print(model)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train':
        get_transform(args.dataset, input_size=args.input_size, augment=True),
        'eval':
        get_transform(args.dataset, input_size=args.input_size, augment=False)
    }
    transform = getattr(model, 'input_transform', default_transform)
    # NOTE(review): `regime` mirrors models that embed a training schedule,
    # but the loop below uses a fixed Adam optimizer; kept for parity.
    regime = getattr(
        model, 'regime', {
            0: {
                'optimizer': args.optimizer,
                'lr': args.lr,
                'momentum': args.momentum,
                'weight_decay': args.weight_decay
            }
        })
    print(transform)
    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.type(args.type)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    # (removed: an unused per-parameter weight-decay list that was never
    # handed to any optimizer)

    mixed_prec_dict = None
    if args.mixed_prec_dict is not None:
        mixed_prec_dict = nemo.utils.precision_dict_from_json(
            args.mixed_prec_dict)
        print("Load mixed precision dict from outside")
    elif args.mem_constraint != '':
        mem_contraints = json.loads(args.mem_constraint)
        print('This is the memory constraint:', mem_contraints)
        if mem_contraints is not None:
            x_test = torch.Tensor(1, 3, args.mobilenet_input,
                                  args.mobilenet_input)
            mixed_prec_dict = memory_driven_quant(model, x_test,
                                                  mem_contraints[0],
                                                  mem_contraints[1],
                                                  args.mixed_prec_quant)

    # multi gpus
    if args.gpus and len(args.gpus) > 1:
        model = torch.nn.DataParallel(model).cuda()
    else:
        model.type(args.type)

    mobilenet_width = float(args.mobilenet_width)
    mobilenet_width_s = args.mobilenet_width
    mobilenet_input = int(args.mobilenet_input)

    if args.resume is None:
        # baseline accuracy of the full-precision network
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  0, None)
        print("[NEMO] Full-precision model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

    if args.quantize:

        # transform the model in a NEMO FakeQuantized representation
        model = nemo.transform.quantize_pact(model,
                                             dummy_input=torch.randn(
                                                 (1, 3, mobilenet_input,
                                                  mobilenet_input)).to('cuda'))
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=1e-5)

        if args.resume is not None:
            checkpoint_file = args.resume
            if os.path.isfile(checkpoint_file):
                logging.info("loading checkpoint '%s'", args.resume)
                checkpoint_loaded = torch.load(checkpoint_file)
                checkpoint = checkpoint_loaded['state_dict']
                model.load_state_dict(checkpoint, strict=True)
                prec_dict = checkpoint_loaded.get('precision')
            else:
                logging.error("no checkpoint found at '%s'", args.resume)
                import sys
                sys.exit(1)

        if args.resume is None:
            # Calibrate at a very high precision first, then step down.
            print("[NEMO] Model calibration")
            model.change_precision(bits=20)
            model.reset_alpha_weights()

            if args.initial_folding:
                model.fold_bn()
                # use DFQ for weight equalization
                if args.initial_equalization:
                    model.equalize_weights_dfq()
            elif args.initial_equalization:
                model.equalize_weights_lsq(verbose=True)
                model.reset_alpha_weights()
                # model.reset_alpha_weights(use_method='dyn_range', dyn_range_cutoff=0.05, verbose=True)

            # calibrate activation ranges after equalization
            with model.statistics_act():
                val_loss, val_prec1, val_prec5 = validate(
                    val_loader, model, criterion, 0, None)

            # # use this in place of the usual calibration, because PACT_Act's descend from ReLU6 and
            # # the trained weights already assume the presence of a clipping effect
            # # this should be integrated in NEMO by saving the "origin" of the PACT_Act!
            # for i in range(0,27):
            #     model.model[i][3].alpha.data[:] = min(model.model[i][3].alpha.item(), model.model[i][3].max)

            val_loss, val_prec1, val_prec5 = validate(val_loader, model,
                                                      criterion, 0, None)

            print("[NEMO] 20-bit calibrated model: top-1=%.2f top-5=%.2f" %
                  (val_prec1, val_prec5))
            nemo.utils.save_checkpoint(
                model,
                optimizer,
                0,
                acc=val_prec1,
                checkpoint_name='mobilenet_%s_%d_calibrated' %
                (mobilenet_width_s, mobilenet_input),
                checkpoint_suffix=args.suffix)

            model.change_precision(bits=activ_bits)
            model.change_precision(bits=weight_bits, scale_activations=False)
            # HACK(review): interactive debugging leftover -- this drops into
            # an IPython shell in the middle of the calibration path and
            # blocks non-interactive runs. Consider removing.
            import IPython
            IPython.embed()

        else:
            print("[NEMO] Not calibrating model, as it is pretrained")
            model.change_precision(bits=1, min_prec_dict=prec_dict)

            # val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion, 0, None)
            # print("[NEMO] pretrained model: top-1=%.2f top-5=%.2f" % (val_prec1, val_prec5))

        if mixed_prec_dict is not None:
            # overlay the externally supplied / memory-driven precisions on
            # top of the model's exported per-layer precision dict
            mixed_prec_dict_all = model.export_precision()
            for k in mixed_prec_dict.keys():
                mixed_prec_dict_all[k] = mixed_prec_dict[k]
            model.change_precision(bits=1, min_prec_dict=mixed_prec_dict_all)

            # freeze and quantize BN parameters
            # nemo.transform.bn_quantizer(model, precision=nemo.precision.Precision(bits=20))
            # model.freeze_bn()
            # model.fold_bn()
            # model.equalize_weights_dfq(verbose=True)
            val_loss, val_prec1, val_prec5 = validate(val_loader, model,
                                                      criterion, 0, None)

            # print("[NEMO] Rounding weights")
            # model.round_weights()
    else:
        # FIX: the original created the optimizer only inside the quantize
        # branch, leaving `optimizer` undefined (NameError) in the training
        # loop whenever --quantize was off.
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=1e-5)

    if args.pure_export:
        # Export path: walk FQ -> QD -> ID stages, sanity-check accuracy on
        # a shortened validation run at each stage, dump ONNX, then exit.
        model.freeze_bn(reset_stats=True, disable_grad=True)
        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  shorten=10)
        print("[NEMO] FQ model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))
        input_bias_dict = {'model.0.0': +1.0, 'model.0.1': +1.0}
        remove_bias_dict = {'model.0.1': 'model.0.2'}
        input_bias = math.floor(1.0 / (2. / 255)) * (2. / 255)
        model.qd_stage(eps_in=2. / 255,
                       add_input_bias_dict=input_bias_dict,
                       remove_bias_dict=remove_bias_dict,
                       int_accurate=True)
        model.model[0][0].value = input_bias
        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  input_bias=input_bias,
                                                  eps_in=2. / 255,
                                                  mode='qd',
                                                  shorten=10)
        print("[NEMO] QD model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))
        model.id_stage()
        model.model[0][0].value = input_bias * (255. / 2)
        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  input_bias=input_bias,
                                                  eps_in=2. / 255,
                                                  mode='id',
                                                  shorten=10)
        print("[NEMO] ID model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))
        nemo.utils.export_onnx('mobilenet_%s_%d.onnx' %
                               (mobilenet_width_s, mobilenet_input),
                               model,
                               model, (3, mobilenet_input, mobilenet_input),
                               perm=None)
        import sys
        sys.exit(0)

    if args.terminal:
        # Debug/verification path: capture intermediate activations at the
        # FQ, QD and ID stages, report per-layer divergence in units of the
        # layer quantum (eps), dump golden vectors for HW tests, then exit.
        fqs = copy.deepcopy(model.state_dict())
        model.freeze_bn(reset_stats=True, disable_grad=True)
        bin_fq, bout_fq, _ = nemo.utils.get_intermediate_activations(
            model, validate, val_loader, model, criterion, 0, None, shorten=1)

        torch.save({'in': bin_fq['model.0.0'][0]}, "input_fq.pth")

        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  0, None)
        print("[NEMO] FQ model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

        input_bias_dict = {'model.0.0': +1.0, 'model.0.1': +1.0}
        remove_bias_dict = {'model.0.1': 'model.0.2'}
        input_bias = math.floor(1.0 / (2. / 255)) * (2. / 255)

        model.qd_stage(eps_in=2. / 255,
                       add_input_bias_dict=input_bias_dict,
                       remove_bias_dict=remove_bias_dict,
                       int_accurate=True)

        # fix ConstantPad2d
        model.model[0][0].value = input_bias

        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  input_bias=input_bias,
                                                  eps_in=2. / 255,
                                                  mode='qd',
                                                  shorten=50)
        print("[NEMO] QD model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

        qds = copy.deepcopy(model.state_dict())
        bin_qd, bout_qd, _ = nemo.utils.get_intermediate_activations(
            model,
            validate,
            val_loader,
            model,
            criterion,
            0,
            None,
            input_bias=input_bias,
            eps_in=2. / 255,
            mode='qd',
            shorten=1)

        torch.save({'qds': qds, 'fqs': fqs}, "states.pth")
        torch.save({'in': bin_qd['model.0.0'][0]}, "input_qd.pth")

        diff = collections.OrderedDict()
        for k in bout_fq.keys():
            diff[k] = (bout_fq[k] - bout_qd[k]).to('cpu').abs()

        # compare FQ vs QD outputs of each block's activation (index 3)
        for i in range(0, 26):
            for j in range(3, 4):
                k = 'model.%d.%d' % (i, j)
                kn = 'model.%d.%d' % (i if j < 3 else i + 1,
                                      j + 1 if j < 3 else 0)
                eps = model.get_eps_at(kn, eps_in=2. / 255)[0]
                print("%s:" % k)
                idx = diff[k] > eps
                n = idx.sum()
                t = (diff[k] > -1e9).sum()
                max_eps = torch.ceil(
                    diff[k].max() /
                    model.get_eps_at('model.%d.0' %
                                     (i + 1), 2. / 255)[0]).item()
                mean_eps = torch.ceil(
                    diff[k][idx].mean() /
                    model.get_eps_at('model.%d.0' %
                                     (i + 1), 2. / 255)[0]).item()
                try:
                    print("  max:   %.3f (%d eps)" %
                          (diff[k].max().item(), max_eps))
                    print("  mean:  %.3f (%d eps) (only diff. elements)" %
                          (diff[k][idx].mean().item(), mean_eps))
                    print("  #diff: %d/%d (%.1f%%)" %
                          (n, t, float(n) / float(t) * 100))
                except ValueError:
                    print("  #diff: 0/%d (0%%)" % (t, ))

        model.id_stage()
        # fix ConstantPad2d
        model.model[0][0].value = input_bias * (255. / 2)

        ids = model.state_dict()
        bin_id, bout_id, _ = nemo.utils.get_intermediate_activations(
            model,
            validate,
            val_loader,
            model,
            criterion,
            0,
            None,
            input_bias=input_bias,
            eps_in=2. / 255,
            mode='id',
            shorten=1)

        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  input_bias=input_bias,
                                                  eps_in=2. / 255,
                                                  mode='id',
                                                  shorten=50)
        print("[NEMO] ID model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

        try:
            os.makedirs("golden")
        except Exception:
            pass

        torch.save({'in': bin_fq['model.0.0'][0]}, "input_id.pth")

        # compare ID (integer, rescaled by eps) vs QD outputs
        diff = collections.OrderedDict()
        for i in range(0, 26):
            for j in range(3, 4):
                k = 'model.%d.%d' % (i, j)
                kn = 'model.%d.%d' % (i if j < 3 else i + 1,
                                      j + 1 if j < 3 else 0)
                eps = model.get_eps_at(kn, eps_in=2. / 255)[0]
                diff[k] = (bout_id[k] * eps - bout_qd[k]).to('cpu').abs()
                print("%s:" % k)
                idx = diff[k] >= eps
                n = idx.sum()
                t = (diff[k] > -1e9).sum()
                max_eps = torch.ceil(diff[k].max() / eps).item()
                mean_eps = torch.ceil(diff[k][idx].mean() / eps).item()
                try:
                    print("  max:   %.3f (%d eps)" %
                          (diff[k].max().item(), max_eps))
                    print("  mean:  %.3f (%d eps) (only diff. elements)" %
                          (diff[k][idx].mean().item(), mean_eps))
                    print("  #diff: %d/%d (%.1f%%)" %
                          (n, t, float(n) / float(t) * 100))
                except ValueError:
                    print("  #diff: 0/%d (0%%)" % (t, ))
        # interactive inspection point (terminal mode is interactive by design)
        import IPython
        IPython.embed()

        # dump golden input/output vectors for every module, batch element 0,
        # in HWC order where the activation is an image
        bidx = 0
        for n, m in model.named_modules():
            try:
                actbuf = bin_id[n][0][bidx].permute((1, 2, 0))
            except RuntimeError:
                actbuf = bin_id[n][0][bidx]
            np.savetxt("golden/golden_input_%s.txt" % n,
                       actbuf.cpu().detach().numpy().flatten(),
                       header="input (shape %s)" % (list(actbuf.shape)),
                       fmt="%.3f",
                       delimiter=',',
                       newline=',\n')
        for n, m in model.named_modules():
            try:
                actbuf = bout_id[n][bidx].permute((1, 2, 0))
            except RuntimeError:
                actbuf = bout_id[n][bidx]
            np.savetxt("golden/golden_%s.txt" % n,
                       actbuf.cpu().detach().numpy().flatten(),
                       header="%s (shape %s)" % (n, list(actbuf.shape)),
                       fmt="%.3f",
                       delimiter=',',
                       newline=',\n')
        nemo.utils.export_onnx("model_int.onnx",
                               model,
                               model, (3, 224, 224),
                               perm=None)

        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  0,
                                                  None,
                                                  input_bias=input_bias,
                                                  eps_in=2. / 255)
        print("[NEMO] ID model: top-1=%.2f top-5=%.2f" %
              (val_prec1, val_prec5))

        import IPython
        IPython.embed()
        import sys
        sys.exit(0)

    for epoch in range(args.start_epoch, args.epochs):
        # optimizer = adjust_optimizer(optimizer, epoch, regime)

        # train for one epoch; BN is absorbed on the first epoch and frozen
        # afterwards
        train_loss, train_prec1, train_prec5 = train(
            train_loader,
            model,
            criterion,
            epoch,
            optimizer,
            freeze_bn=True if epoch > 0 else False,
            absorb_bn=True if epoch == 0 else False)
        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        # save_model
        if args.save_check:
            nemo.utils.save_checkpoint(
                model,
                optimizer,
                0,
                acc=val_prec1,
                checkpoint_name='mobilenet_%s_%d%s_checkpoint' %
                (mobilenet_width_s, mobilenet_input,
                 "_mixed" if mixed_prec_dict is not None else ""),
                checkpoint_suffix=args.suffix)

        if is_best:
            nemo.utils.save_checkpoint(
                model,
                optimizer,
                0,
                acc=val_prec1,
                checkpoint_name='mobilenet_%s_%d%s_best' %
                (mobilenet_width_s, mobilenet_input,
                 "_mixed" if mixed_prec_dict is not None else ""),
                checkpoint_suffix=args.suffix)

        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \t'.format(
                         epoch + 1,
                         train_loss=train_loss,
                         val_loss=val_loss,
                         train_prec1=train_prec1,
                         val_prec1=val_prec1,
                         train_prec5=train_prec5,
                         val_prec5=val_prec5))

        results.add(epoch=epoch + 1,
                    train_loss=train_loss,
                    val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5)
        results.save()
def main():
    """Entry point: parse CLI args, build the model and data loaders, then
    train/validate for ``args.epochs`` epochs with checkpointing and logging.

    Side effects:
        - Mutates the module-level ``args`` and ``best_prec1``.
        - Creates ``save_path`` (results dir), writes ``log.txt``,
          ``results.csv``/``results.html`` and TensorBoard event files.
        - May prompt interactively on stdin if the results dir exists.
    """
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()

    if args.evaluate:
        args.results_dir = '/tmp'
    # FIX: was `args.save is ''` — identity comparison with a literal is
    # undefined behavior and a SyntaxWarning on Python >= 3.8; use equality.
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    save_name = args.model + "_" + args.majority + "_pad=" + str(args.padding) + "_Data=" + args.dataset

    if args.resume != '':
        save_name = save_name + "_resume"

    save_path = os.path.join(args.results_dir, save_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        # Directory already exists: either overwrite in place, or append a
        # timestamp and create a fresh directory. (Removed the dead
        # `save_path = save_path` no-op branch.)
        tim = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        overwrite = input("Directory {} already exists. Would you like to overwrite (y/n): ".format(save_path))
        if overwrite != "y":
            save_path = save_path + "_{}".format(tim)
            os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    logging.info("setting up tensorboard")
    writer = SummaryWriter(log_dir=save_path)

    if 'cuda' in args.type:
        # `args.gpus` arrives as a comma-separated string, e.g. "0,1".
        args.gpus = [int(i) for i in args.gpus.split(',')]
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
    else:
        args.gpus = None

    # create model
    logging.info("creating model %s", args.model)
    model = models.__dict__[args.model]

    args.num_classes = get_num_classes(args.dataset)
    model_config = {'input_size': args.input_size, 'dataset': args.dataset, 'backprop': args.backprop,
                    'majority': args.majority, 'padding': args.padding, 'num_classes': args.num_classes, 'depth': args.depth}

    # FIX: was `args.model_config is not ''` — same identity-vs-equality bug.
    if args.model_config != '':
        # literal_eval safely parses a dict literal from the CLI string.
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            # Resuming from a directory: reload the results log and pick up
            # the best-checkpoint file inside it.
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            # Checkpoint stores the *next* epoch; resume from the one before.
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # Generator instead of a throwaway list; clearer name than `l`.
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train': get_transform(args.dataset,
                               input_size=args.input_size, augment=True),
        'eval': get_transform(args.dataset,
                              input_size=args.input_size, augment=False)
    }
    # Models may override the input transform and the training regime.
    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(model, 'regime', {0: {'optimizer': args.optimizer,
                                           'lr': args.lr,
                                           'momentum': args.momentum,
                                           'weight_decay': args.weight_decay}})
    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.type(args.type)
    model.type(args.type)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0)
        return

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    logging.info('training regime: %s', regime)

    for epoch in range(args.start_epoch, args.epochs):
        optimizer = adjust_optimizer(optimizer, epoch, regime)
        lr = optimizer.param_groups[0]['lr']

        # user function in utils.py to override regime and update learning rate
        lr = lr_schedule(lr, epoch, args.epochs, start_epoch=args.start_epoch)
        writer.add_scalar("lr", lr, epoch)
        adjust_learning_rate(optimizer, lr)

        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(
            train_loader, model, criterion, epoch, optimizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(
            val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'regime': regime
        }, is_best, path=save_path)
        logging.info('\n Epoch: {0}\t'
                     'lr {lr: .5f} \t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \n'
                     .format(epoch + 1, lr=lr, train_loss=train_loss, val_loss=val_loss,
                             train_prec1=train_prec1, val_prec1=val_prec1,
                             train_prec5=train_prec5, val_prec5=val_prec5))

        # adds results to html log file
        results.add(epoch=epoch + 1, train_loss=train_loss, val_loss=val_loss,
                    train_error1=100 - train_prec1, val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5, val_error5=100 - val_prec5,
                    lr=lr)

        # also add results to tensorboard summary writer
        train_res = {
            'loss': train_loss,
            'accuracy': train_prec1,
        }
        log_result(writer, "train", train_res, epoch + 1)
        val_res = {
            'loss': val_loss,
            'accuracy': val_prec1,
        }
        log_result(writer, "val", val_res, epoch + 1)

        #results.plot(x='epoch', y=['train_loss', 'val_loss'],
        #             title='Loss', ylabel='loss')
        #results.plot(x='epoch', y=['train_error1', 'val_error1'],
        #             title='Error@1', ylabel='error %')
        #results.plot(x='epoch', y=['train_error5', 'val_error5'],
        #             title='Error@5', ylabel='error %')
        results.save()