Example #1
0
def calculate_loss_vector(network, path, location_path, communicator):
    """Accumulate per-word negative log-probabilities over one sweep of *path*.

    Evaluates the two-headed (row/col) model on every minibatch of the data
    set and subtracts log(softmax(...)) per target word into two loss tables.

    Args:
        network: dict of CNTK nodes with keys 'model', 'row', 'col'.
        path: corpus file to evaluate.
        location_path: word-allocation table path consumed by DataSource.
        communicator: distributed communicator supplying this worker's rank.

    Returns:
        (col_loss_vector, row_loss_vector) — NOTE(review): col comes first;
        a caller unpacking ``row, col = calculate_loss_vector(...)`` would
        receive them swapped — confirm the intended order.
    """
    source = DataSource(path, opt.vocab_file, location_path,
                        opt.seqlength, opt.batchsize)
    # the curr row -> the curr col
    # the curr col -> the next row
    # Log-softmax of each model head; both evaluated in one pass via combine.
    row_loss = C.log(C.softmax(network['model'].outputs[0]))
    col_loss = C.log(C.softmax(network['model'].outputs[1]))
    loss = C.combine([row_loss, col_loss])
    # One accumulator row per vocabulary word; vocab_sqrt is a module global.
    row_loss_vector = np.zeros((opt.vocabsize, vocab_sqrt))
    col_loss_vector = np.zeros((opt.vocabsize, vocab_sqrt))

    flag = True
    while flag:
        # Each worker reads its rank's shard of the global minibatch.
        mb = source.next_minibatch(opt.seqlength * opt.batchsize * Communicator.num_workers(),
                                   Communicator.num_workers(),
                                   communicator.rank())
        result = loss.eval({
            network['row']: mb[source.input1],
            network['col']: mb[source.input2],
        })
        row_prob = result[loss.outputs[0]]
        col_prob = result[loss.outputs[1]]
        label1 = mb[source.word1].asarray()
        label2 = mb[source.word2].asarray()
        sequences = len(label1)
        for i in range(sequences):
            seqlength = len(row_prob[i])
            for j in range(seqlength):
                # Target word ids; labels are one-element arrays per position.
                row_word = int(label1[i][j][0])
                col_word = int(label2[i][j][0])
                # Subtracting log-prob accumulates negative log-likelihood.
                row_loss_vector[row_word] -= row_prob[i][j]
                col_loss_vector[col_word] -= col_prob[i][j]
        # Stop after the minibatch that completes one full data sweep.
        flag = not mb[source.input1].sweep_end
    return col_loss_vector, row_loss_vector
def bn_inception_train_and_eval(train_data, test_data, mean_data, num_quantization_bits=32, epoch_size=1281167, max_epochs=300, minibatch_size=None,
                         restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
    """Create the BN-Inception network and run the distributed train/eval cycle."""
    _cntk_py.set_computation_network_trace_level(0)

    # NOTE: scaling up the minibatch size increases sample throughput; on an
    # 8-GPU machine samples/sec is ~7x a single GPU vs ~3x without scaling up.
    # A bigger minibatch over the same sample budget means fewer updates and
    # thus higher training error — a speed/accuracy trade-off.
    if minibatch_size is not None:
        effective_mb_size = minibatch_size
    else:
        effective_mb_size = 32 * (Communicator.num_workers() if scale_up else 1)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_bn_inception()
    trainer = create_trainer(network, epoch_size, max_epochs, effective_mb_size,
                             num_quantization_bits, progress_printer)
    train_source = create_image_mb_source(train_data, mean_data, True,
                                          total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, False,
                                         total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source,
                   effective_mb_size, epoch_size, restore, profiling)
Example #3
0
def calculate_loss_vector(network, path, location_path, communicator):
    """Accumulate negative log-likelihood per vocabulary word over one data sweep.

    Returns the pair (column-loss table, row-loss table).
    """
    source = DataSource(path, opt.vocab_file, location_path,
                        opt.seqlength, opt.batchsize)
    # Row head scores the current column; column head scores the next row.
    log_row = C.log(C.softmax(network['model'].outputs[0]))
    log_col = C.log(C.softmax(network['model'].outputs[1]))
    joint = C.combine([log_row, log_col])
    nll_rows = np.zeros((opt.vocabsize, vocab_sqrt))
    nll_cols = np.zeros((opt.vocabsize, vocab_sqrt))

    while True:
        mb = source.next_minibatch(
            opt.seqlength * opt.batchsize * Communicator.num_workers(),
            Communicator.num_workers(),
            communicator.rank())
        outputs = joint.eval({
            network['row']: mb[source.input1],
            network['col']: mb[source.input2],
        })
        probs_row = outputs[joint.outputs[0]]
        probs_col = outputs[joint.outputs[1]]
        targets_row = mb[source.word1].asarray()
        targets_col = mb[source.word2].asarray()
        for seq_idx in range(len(targets_row)):
            for pos in range(len(probs_row[seq_idx])):
                w_row = int(targets_row[seq_idx][pos][0])
                w_col = int(targets_col[seq_idx][pos][0])
                nll_rows[w_row] -= probs_row[seq_idx][pos]
                nll_cols[w_col] -= probs_col[seq_idx][pos]
        # Leave the loop once the minibatch completing the sweep is processed.
        if mb[source.input1].sweep_end:
            break
    return nll_cols, nll_rows
def bn_inception_train_and_eval(train_data, test_data, mean_data, num_quantization_bits=32, epoch_size=50000, max_epochs=200, minibatch_size=None,
                         restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
    """Build BN-Inception and run distributed training followed by evaluation.

    Args:
        train_data, test_data: map files for the image minibatch sources.
        mean_data: mean-image XML used by the readers.
        num_quantization_bits: gradient quantization bits for distribution.
        epoch_size: samples per epoch (CIFAR-scale default of 50000 here).
        max_epochs: number of training epochs.
        minibatch_size: explicit minibatch size; None selects 128, multiplied
            by the worker count when ``scale_up`` is set.
        restore: resume from a checkpoint if one exists.
        log_to_file, num_mbs_per_log, gen_heartbeat: ProgressPrinter options.
        scale_up: scale the default minibatch size by the number of workers.
        profiling: forwarded to train_and_test to enable profiling.
    """
    _cntk_py.set_computation_network_trace_level(0)

    # NOTE: scaling up minibatch_size increases sample throughput. In 8-GPU machine,
    # ResNet110 samples-per-second is ~7x of single GPU, comparing to ~3x without scaling
    # up. However, bigger minibatch size on the same number of samples means less updates,
    # thus leads to higher training error. This is a trade-off of speed and accuracy
    if minibatch_size is None:
        mb_size = 128 * (Communicator.num_workers() if scale_up else 1)
    else:
        mb_size = minibatch_size

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_bn_inception()
    trainer = create_trainer(network, epoch_size, max_epochs, mb_size, num_quantization_bits, progress_printer)
    train_source = create_image_mb_source(train_data, mean_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, False, total_number_of_samples=FULL_DATA_SWEEP)

    # NOTE(review): unlike the other bn_inception_train_and_eval variant in
    # this file, this call also passes max_epochs — confirm which
    # train_and_test signature is in scope here.
    train_and_test(network, trainer, train_source, test_source, max_epochs, mb_size, epoch_size, restore, profiling)
Example #5
0
def main():
    """Entry point: prepare directories, build the model, run every
    allocation round, then shut down the distributed communicator."""
    prepare_dir()  # create the vocab dir and model dir

    network = create_model(vocab_sqrt)
    if opt.pre_model:
        # Warm-start from a previously saved model checkpoint.
        network['model'].restore(opt.pre_model)
    log_number_of_parameters(network['model'])
    location_path = os.path.join(opt.vocabdir, opt.alloc_file)
    # Each round trains with the current word allocation, then produces the
    # allocation table used by the next round.
    for round_id, _ in enumerate(opt.epochs):
        train(network, location_path, round_id)
        location_path = get_k_round_location_path(round_id + 1)
    Communicator.finalize()
Example #6
0
def main():
    """Entry point: set up directories, build the model, and run all
    word-allocation training rounds before finalizing MPI."""
    prepare_dir()  # create the vocab dir and model dir

    network = create_model(vocab_sqrt)
    if opt.pre_model:
        # Warm-start from a previously saved checkpoint.
        network['model'].restore(opt.pre_model)
    log_number_of_parameters(network['model'])
    location_path = os.path.join(opt.vocabdir, opt.alloc_file)
    for i in range(len(opt.epochs)):
        # Round i trains on the current allocation, then yields the next one.
        train(network, location_path, i)
        location_path = get_k_round_location_path(i + 1)
    Communicator.finalize()
def sequence_to_sequence_translator(train_data, test_data, epoch_size=908241, num_quantization_bits=default_quantization_bits, block_size=3200, warm_up=0, minibatch_size=72, max_epochs=10, randomize_data=False, log_to_file=None, num_mbs_per_log=10, gen_heartbeat=False):
    """Train and evaluate the sequence-to-sequence model distributedly."""
    cntk.debugging.set_computation_network_trace_level(0)
    from _cntk_py import set_fixed_random_seed
    set_fixed_random_seed(1)  # deterministic runs across invocations

    from Sequence2Sequence import create_model

    # Report distributed-sync statistics on every update when block momentum
    # (block_size) is in use; otherwise leave reporting disabled.
    distributed_sync_report_freq = 1 if block_size is not None else None

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs,
        distributed_freq=distributed_sync_report_freq)

    # Build the model, then the readers sized to the full training budget.
    model = create_model()
    train_reader = create_reader(train_data, randomize_data,
                                 size=max_epochs * epoch_size)
    test_reader = create_reader(test_data, False,
                                size=max_epochs * epoch_size * 10)

    train_and_test(model, train_reader, test_reader, block_size,
                   num_quantization_bits, max_epochs, epoch_size,
                   minibatch_size, progress_printer, warm_up)
Example #8
0
def sequence_to_sequence_translator(
        train_data,
        test_data,
        epoch_size=908241,
        num_quantization_bits=default_quantization_bits,
        block_size=3200,
        warm_up=0,
        minibatch_size=72,
        max_epochs=10,
        randomize_data=False,
        log_to_file=None,
        num_mbs_per_log=10,
        gen_heartbeat=False):
    """Train and evaluate the sequence-to-sequence model with a
    distributed learner.

    Args:
        train_data, test_data: CTF data files for the readers.
        epoch_size: samples per epoch.
        num_quantization_bits: gradient quantization for data-parallel SGD.
        block_size: block-momentum block size; None disables block momentum.
        warm_up: samples trained without distribution before parallelizing.
        minibatch_size, max_epochs: SGD schedule parameters.
        randomize_data: whether the training reader shuffles.
        log_to_file, num_mbs_per_log, gen_heartbeat: ProgressPrinter options.
    """
    cntk.debugging.set_computation_network_trace_level(0)
    from _cntk_py import set_fixed_random_seed
    # Fixed seed so runs are reproducible.
    set_fixed_random_seed(1)

    from Sequence2Sequence import create_model

    # Report distributed sync stats every update when block momentum is used.
    distributed_sync_report_freq = None
    if block_size is not None:
        distributed_sync_report_freq = 1

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs,
        distributed_freq=distributed_sync_report_freq)

    # create inputs and create model
    model = create_model()

    train_reader = create_reader(train_data,
                                 randomize_data,
                                 size=max_epochs * epoch_size)
    # NOTE(review): the test reader is sized 10x the training budget rather
    # than a full-sweep constant — confirm this is intentional.
    test_reader = create_reader(test_data,
                                False,
                                size=max_epochs * epoch_size * 10)

    train_and_test(model, train_reader, test_reader, block_size,
                   num_quantization_bits, max_epochs, epoch_size,
                   minibatch_size, progress_printer, warm_up)
def vgg19_train_and_eval(train_data,
                         test_data,
                         num_quantization_bits=32,
                         minibatch_size=128,
                         epoch_size=1281167,
                         max_epochs=80,
                         restore=True,
                         log_to_file=None,
                         num_mbs_per_log=None,
                         gen_heartbeat=False,
                         testing=False):
    """Construct VGG19 and run distributed training plus evaluation."""
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits,
                             progress_printer)
    train_source = create_image_mb_source(
        train_data, True, total_number_of_samples=max_epochs * epoch_size)

    # Shrink validation during automated testing; otherwise sweep everything.
    if testing:
        num_of_validation_samples = max_epochs * epoch_size * 10
    else:
        num_of_validation_samples = FULL_DATA_SWEEP

    test_source = create_image_mb_source(
        test_data, False, total_number_of_samples=num_of_validation_samples)
    train_and_test(network, trainer, train_source, test_source,
                   minibatch_size, epoch_size, restore)
def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, testing=False):
    """Build VGG19 and run distributed training followed by evaluation.

    Args:
        train_data, test_data: map files for the image minibatch sources.
        num_quantization_bits: gradient quantization for distribution.
        minibatch_size, epoch_size, max_epochs: SGD schedule parameters
            (ImageNet-scale defaults).
        restore: resume from an existing checkpoint if available.
        log_to_file, num_mbs_per_log, gen_heartbeat: ProgressPrinter options.
        testing: when True, caps the validation sample count instead of
            sweeping the full test set.
    """
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)

    if testing:
        # reduce number of samples for validation when testing
        num_of_validation_samples = max_epochs * epoch_size * 10
    else:
        num_of_validation_samples = FULL_DATA_SWEEP

    test_source = create_image_mb_source(test_data, False, total_number_of_samples=num_of_validation_samples)
    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
        model_path = args['outputdir'] + "/models"
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.try_set_default_device(cntk.device.gpu(args['device']))

    data_path = args['datadir']

    if not os.path.isdir(data_path):
        raise RuntimeError("Directory %s does not exist" % data_path)

    os.chdir(data_path)

    mean_data = os.path.join(data_path, 'ImageNet1K_mean.xml')
    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'val_map.txt')

    bn_inception_train_and_eval(train_data, test_data, mean_data,
                                epoch_size=args['epoch_size'],
                                num_quantization_bits=args['quantized_bits'],
                                max_epochs=args['num_epochs'],
                                minibatch_size=args["minibatch_size"],
                                restore=not args['restart'],
                                log_to_file=args['logdir'],
                                num_mbs_per_log=100,
                                gen_heartbeat=True,
                                scale_up=bool(args['scale_up']))

    # Must call MPI finalize when process exit without exceptions
    Communicator.finalize()
Example #12
0
def train(network, location_path, id):
    """Run one word-allocation round: train for ``opt.epochs[id]`` epochs,
    evaluate, aggregate per-word losses across workers, and write the next
    round's allocation table.

    Args:
        network: dict of CNTK nodes ('model', 'row', 'col', 'row_label',
            'col_label').
        location_path: current word-allocation table path.
        id: zero-based round index (note: parameter name shadows the builtin
            ``id``; kept for caller compatibility).
    """
    train_path = os.path.join(opt.datadir, opt.train_file)
    valid_path = os.path.join(opt.datadir, opt.valid_file)
    test_path = os.path.join(opt.datadir, opt.test_file)

    criterion = create_criterion(network)
    ce, pe = criterion[0], criterion[1]
    learner = create_learner(network['model'])
    learner = data_parallel_distributed_learner(learner)
    communicator = learner.communicator()
    trainer = C.Trainer(network['model'], (ce, pe), learner)

    # loop over epoch
    for epoch in range(opt.epochs[id]):
        source = DataSource(train_path, opt.vocab_file, location_path,
                            opt.seqlength, opt.batchsize)
        loss, metric, tokens, batch_id = 0, 0, 0, 0
        start_time = datetime.datetime.now()
        flag = True

        # loop over minibatch in the epoch
        while flag:
            # Each worker reads its rank's shard of the global minibatch.
            mb = source.next_minibatch(opt.seqlength * opt.batchsize * Communicator.num_workers(),
                                       Communicator.num_workers(),
                                       communicator.rank())
            trainer.train_minibatch({
                network['row']: mb[source.input1],
                network['col']: mb[source.input2],
                network['row_label']: mb[source.label1],
                network['col_label']: mb[source.label2]
            })
            # Accumulate sample-weighted loss/metric for periodic reporting.
            samples = trainer.previous_minibatch_sample_count
            loss += trainer.previous_minibatch_loss_average * samples
            metric += trainer.previous_minibatch_evaluation_average * samples
            tokens += samples
            batch_id += 1
            if Communicator.num_workers() > 1:
                communicator.barrier()
            if batch_id != 0 and batch_id % opt.freq == 0:
                diff_time = (datetime.datetime.now() - start_time)
                # BUGFIX: use total_seconds() instead of ``.seconds`` (which
                # wraps at 24h) and guard against a sub-second interval
                # raising ZeroDivisionError.
                elapsed = max(diff_time.total_seconds(), 1e-6)
                print("Epoch {:2}: Minibatch [{:5} - {:5}], loss = {:.6f}, error = {:.6f}, speed = {:3} tokens/s".format(
                    epoch + 1, batch_id - opt.freq + 1, batch_id,
                    loss / tokens, metric / tokens, int(tokens / elapsed)))
            flag = not mb[source.input1].sweep_end

        # Evaluation action
        if communicator.is_main():
            valid_error = evaluate(network, valid_path, location_path)
            test_error = evaluate(network, test_path, location_path)
            print("Epoch {:2} Done : Valid error = {:.6f}, Test error = {:.6f}".format(epoch + 1, valid_error, test_error))
            network['model'].save(os.path.join(opt.outputdir, 'round{}_epoch{}_'.format(id, epoch) + opt.save))
        if Communicator.num_workers() > 1:
            communicator.barrier()
    # word allocate action
    # NOTE(review): calculate_loss_vector returns (col, row) — confirm this
    # unpacking order matches allocate_table's expectations.
    row_loss, col_loss = calculate_loss_vector(network, train_path, location_path, communicator)
    if Communicator.num_workers() > 1:
        # BUGFIX: only the import is guarded. Previously a bare ``except:``
        # wrapped the whole reduction, so any recv/send failure was swallowed
        # and misreported as a missing mpi4py installation.
        try:
            from mpi4py import MPI
        except ImportError:
            raise RuntimeError("Please install mpi4py if uses multi gpus!")
        comm = MPI.COMM_WORLD
        if communicator.is_main():
            # Reduce partial loss tables from every other worker onto rank 0.
            for i in range(1, Communicator.num_workers()):
                row_loss_i, col_loss_i = comm.recv(source=i)
                row_loss += row_loss_i
                col_loss += col_loss_i
        else:
            data_send = [row_loss, col_loss]
            comm.send(data_send, 0)
        communicator.barrier()
    if communicator.is_main():
        allocate_table(row_loss, col_loss,
                       opt.vocabsize, vocab_sqrt, opt.vocab_file,
                       get_k_round_location_path(id + 1))
    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.try_set_default_device(cntk.device.gpu(args['device']))

    data_path = args['datadir']

    if not os.path.isdir(data_path):
        raise RuntimeError("Directory %s does not exist" % data_path)

    os.chdir(data_path)

    mean_data = os.path.join(data_path, 'CIFAR-10_mean.xml')
    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'test_map.txt')

    bn_inception_train_and_eval(train_data, test_data, mean_data,
                                epoch_size=args['epoch_size'],
                                num_quantization_bits=args['quantized_bits'],
                                max_epochs=args['num_epochs'],
                                minibatch_size=args["minibatch_size"],
                                restore=not args['restart'],
                                log_to_file=args['logdir'],
                                num_mbs_per_log=100,
                                gen_heartbeat=True,
                                scale_up=bool(args['scale_up']),
                                profiling=args['profile'])
    Communicator.finalize()
Example #14
0
def train(network, location_path, id):
    """Run one word-allocation round: train for ``opt.epochs[id]`` epochs,
    evaluate on validation/test sets, aggregate per-word losses across
    workers, and write the next round's allocation table.

    Args:
        network: dict of CNTK nodes ('model', 'row', 'col', 'row_label',
            'col_label').
        location_path: current word-allocation table path.
        id: zero-based round index (shadows the builtin ``id``).
    """
    train_path = os.path.join(opt.datadir, opt.train_file)
    valid_path = os.path.join(opt.datadir, opt.valid_file)
    test_path = os.path.join(opt.datadir, opt.test_file)

    criterion = create_criterion(network)
    ce, pe = criterion[0], criterion[1]
    learner = create_learner(network['model'])
    learner = data_parallel_distributed_learner(learner)
    communicator = learner.communicator()
    trainer = C.Trainer(network['model'], (ce, pe), learner)

    # loop over epoch
    for epoch in range(opt.epochs[id]):
        source = DataSource(train_path, opt.vocab_file, location_path,
                            opt.seqlength, opt.batchsize)
        loss, metric, tokens, batch_id = 0, 0, 0, 0
        start_time = datetime.datetime.now()
        flag = True

        # loop over minibatch in the epoch
        while flag:
            # Each worker reads its rank's shard of the global minibatch.
            mb = source.next_minibatch(opt.seqlength * opt.batchsize * Communicator.num_workers(),
                                       Communicator.num_workers(),
                                       communicator.rank())
            trainer.train_minibatch({
                network['row']: mb[source.input1],
                network['col']: mb[source.input2],
                network['row_label']: mb[source.label1],
                network['col_label']: mb[source.label2]
            })
            # Sample-weighted accumulation for the periodic progress report.
            samples = trainer.previous_minibatch_sample_count
            loss += trainer.previous_minibatch_loss_average * samples
            metric += trainer.previous_minibatch_evaluation_average * samples
            tokens += samples
            batch_id += 1
            if Communicator.num_workers() > 1:
                communicator.barrier()
            if batch_id != 0 and batch_id % opt.freq == 0:
                diff_time = (datetime.datetime.now() - start_time)
                # NOTE(review): ``diff_time.seconds`` wraps at 24h and is 0
                # for sub-second intervals (ZeroDivisionError) — consider
                # total_seconds() with a guard.
                print("Epoch {:2}: Minibatch [{:5} - {:5}], loss = {:.6f}, error = {:.6f}, speed = {:3} tokens/s".format(
                    epoch + 1, batch_id - opt.freq + 1, batch_id,
                    loss / tokens, metric / tokens, tokens // diff_time.seconds))
            flag = not mb[source.input1].sweep_end

        # Evaluation action
        if communicator.is_main():
            valid_error = evaluate(network, valid_path, location_path)
            test_error = evaluate(network, test_path, location_path)
            print("Epoch {:2} Done : Valid error = {:.6f}, Test error = {:.6f}".format(epoch + 1, valid_error, test_error))
            network['model'].save(os.path.join(opt.outputdir, 'round{}_epoch{}_'.format(id, epoch) + opt.save))
        if Communicator.num_workers() > 1:
            communicator.barrier()
    # word allocate action
    # NOTE(review): calculate_loss_vector returns (col, row) — confirm this
    # unpacking order matches allocate_table's expectations.
    row_loss, col_loss = calculate_loss_vector(network, train_path, location_path, communicator)
    if Communicator.num_workers() > 1:
        try:
            from mpi4py import MPI
            comm = MPI.COMM_WORLD
            if communicator.is_main():
                # Reduce partial loss tables from all other workers onto rank 0.
                for i in range(1, Communicator.num_workers()):
                    row_loss_i, col_loss_i = comm.recv(source=i)
                    row_loss += row_loss_i
                    col_loss += col_loss_i
            else:
                data_send = [row_loss, col_loss]
                comm.send(data_send, 0)
        # NOTE(review): this bare except also swallows recv/send failures and
        # misreports them as a missing mpi4py install — consider narrowing
        # the guard to the import alone.
        except:
            raise RuntimeError("Please install mpi4py if uses multi gpus!")
        communicator.barrier()
    if communicator.is_main():
        allocate_table(row_loss, col_loss,
                       opt.vocabsize, vocab_sqrt, opt.vocab_file,
                       get_k_round_location_path(id + 1))
Example #15
0
 def finalize(self):
     """Tear down the distributed backend, if one was set up.

     Calls ``Communicator.finalize()`` only when this instance was created
     in distributed mode (``self._distributed`` truthy); otherwise a no-op.
     """
     if self._distributed:
         Communicator.finalize()