Beispiel #1
0
def get_popdist_device(args, request_ipus):
    """Pick the device id assigned by popdist and acquire that device.

    The per-replica IPU count is ``request_ipus`` floor-divided by
    ``args.replication_factor``. That count is checked with
    ``popdist.checkNumIpusPerReplica``; on success the id returned by
    ``popdist.getDeviceId`` is stored in ``args.device_id`` and the result
    of ``get_device_by_id(args, request_ipus)`` is returned.

    Args:
        args: options object; ``replication_factor`` is read and
            ``device_id`` is written.
        request_ipus: total number of IPUs requested.

    Returns:
        Whatever ``get_device_by_id`` returns for the selected device.

    Raises:
        RuntimeError: if popdist rejects the computed IPUs-per-replica count.
    """
    ipus_per_replica = request_ipus // args.replication_factor

    if popdist.checkNumIpusPerReplica(ipus_per_replica):
        # Record the popdist-assigned id on args before acquiring the device.
        args.device_id = popdist.getDeviceId(ipus_per_replica)
        return get_device_by_id(args, request_ipus)

    # Mismatch between the model's needs and the popdist launch settings.
    message = (
        f"The number IPUs per replica ({ipus_per_replica}) required for the model configuration"
        f" does not match the specified popdist IPUs per replica ({popdist.getNumIpusPerReplica()})"
    )
    raise RuntimeError(message)
Beispiel #2
0
if __name__ == '__main__':
    # Only let TensorFlow log messages at ERROR level.
    tf.logging.set_verbosity(tf.logging.ERROR)

    opts = make_global_options([add_pretraining_options])

    # Shard count comes from the largest index in the device mapping (plus
    # one), passed through ipu_utils.next_power_of_two.
    opts['shards'] = ipu_utils.next_power_of_two(
        1 + max(opts["device_mapping"]))

    if not popdist.isPopdistEnvSet():
        # No popdist environment: the configured replica count is also the
        # total, and no specific IPU is selected.
        opts['use_popdist'] = False
        opts['select_ipu'] = None
        opts['total_replicas'] = opts['replicas']
    else:
        # Popdist environment: replica counts are taken from popdist.
        opts['use_popdist'] = True
        opts['replicas'] = popdist.getNumLocalReplicas()
        opts['total_replicas'] = popdist.getNumTotalReplicas()
        # No device id is requested from popdist when only compiling.
        opts['select_ipu'] = (None if opts['compile_only']
                              else popdist.getDeviceId())

    set_defaults(opts)

    set_poplar_engine_options(
        execution_profile=opts['execution_profile'],
        memory_profile=opts['memory_profile'],
        profile_dir=str(opts['profile_dir']),
        # Independent replica sync is only passed on with more than one replica.
        sync_replicas_independently=(opts['replicas'] > 1
                                     and opts['sync_replicas_independently']),
        synthetic_data=opts['synthetic_data'],
        tensorflow_progress_bar=opts['progress_bar'])
Beispiel #3
0
                        type=str,
                        default="./ckpt_init/yolov3_coco_converted.fp16.ckpt",
                        help="ckpt init weight")

    arguments = parser.parse_args()
    # The base options come from the JSON config file named on the command line.
    with open(arguments.config) as f:
        opts = json.load(f)

    # Command-line paths override whatever the config file holds.
    opts['train']['annot_path'] = arguments.train_path
    opts['train']['initial_weight'] = arguments.init_weight
    opts['test']['annot_path'] = arguments.test_path

    if popdist.isPopdistEnvSet():
        # Query popdist once and reuse the values below.
        num_local_replicas = popdist.getNumLocalReplicas()
        num_total_replicas = popdist.getNumTotalReplicas()

        opts["use_popdist"] = True
        opts["train"]["replicas"] = num_local_replicas
        opts["train"]["total_replicas"] = num_total_replicas
        # The device is requested with one IPU per device_mapping entry.
        opts["select_ipu"] = popdist.getDeviceId(
            len(opts["train"]["device_mapping"]))
        # Integer division keeps these values exact ints instead of going
        # through float division and truncation.
        opts["distributed_worker_count"] = (
            num_total_replicas // num_local_replicas)
        opts["distributed_worker_index"] = (
            popdist.getReplicaIndexOffset() // num_local_replicas)
    else:
        # Not launched under popdist: a single worker whose replica count is
        # also the total.
        opts["use_popdist"] = False
        opts["train"]["total_replicas"] = opts["train"]["replicas"]
        # NOTE(review): -1 presumably means "no specific device" - confirm
        # against the code that consumes select_ipu.
        opts["select_ipu"] = -1
        opts["distributed_worker_count"] = 1
        opts["distributed_worker_index"] = 0

    # Each instance gets a different seed, so its data is shuffled differently.
Beispiel #4
0
        # Validate --available-memory-proportion: more than one value is only
        # accepted with pipelining, and then exactly 2*shards values are needed.
        amps = opts['available_memory_proportion']
        if amps and len(amps) > 1:
            if not opts['pipeline']:
                raise ValueError(
                    '--available-memory-proportion should only have one value unless using pipelining'
                )
            elif len(amps) != 2 * int(opts['shards']):
                raise ValueError(
                    '--available-memory-proportion should have either one value or 2*shards values specified'
                )

        if not popdist.isPopdistEnvSet():
            # No popdist environment: the configured replicas are the total.
            opts['use_popdist'] = False
            opts['total_replicas'] = opts['replicas']
        else:
            # Popdist environment: replica counts and the device id (stored
            # as a string) are taken from popdist.
            opts['use_popdist'] = True
            opts['replicas'] = popdist.getNumLocalReplicas()
            opts['total_replicas'] = popdist.getNumTotalReplicas()
            device_id = popdist.getDeviceId(opts['shards'])
            opts['select_ipu'] = str(device_id)

        # Store the full command line in the options.
        opts["command"] = ' '.join(sys.argv)
        set_defaults(model, lr_schedule, opts)

        if opts['dataset'] == 'imagenet':
            # ImageNet falls back to 224x224 when no image size was given.
            if opts['image_size'] is None:
                opts['image_size'] = 224
            image_side = opts['image_size']
            # Sizes other than 224 are appended to the run name.
            if image_side != 224:
                opts['name'] += '_{}x{}'.format(image_side, image_side)
            opts['summary_str'] += "Image Size: {}x{}\n".format(
                image_side, image_side)