Example #1
def main():
    print('start job ...')
    start_time = time.time()

    # 1. create infrastructure
    supported_regions = [
        'cn-huhehaote', 'cn-shanghai', 'cn-zhangjiakou', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, \
        f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, " \
        f"but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            instance_type=INSTANCE_TYPE,
                            disable_nas=True,
                            spot=True,
                            install_script='')

    init_ncluster = time.time()
    print('init ncluster:', init_ncluster - start_time)

    # 2. upload GTC code
    job.run('yum install -y unzip')
    job.upload('GTC')
    job.run(
        'cd GTC'
        ' && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/dataset.zip'
        ' && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/test.JPG'
        ' && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/resnet50-19c8e357.pth'
        ' && conda activate torch_1.3_cu10.0_py36')
    upload_data = time.time()
    print('upload_data time:', upload_data - init_ncluster)

    # 3. prepare the dataset
    job.run('unzip -o dataset.zip')
    unzip_time = time.time()
    print('unzip data:', unzip_time - upload_data)

    # 4. run the training job
    job.tasks[0].run('conda activate torch_1.3_cu10.0_py36')
    job.tasks[0].run('./run-perseus.sh 2>&1 | tee logs.log',
                     non_blocking=False)
    train_time = time.time()
    print('training time:', train_time - unzip_time)

    # 5. run the inference job
    job.tasks[0].run('python inference.py 2>&1 | tee logs.inference.log',
                     non_blocking=False)
    print('inference time:', time.time() - train_time)

    elapsed_time = time.time() - start_time
    print(f'training and inference deploy time: {elapsed_time} s.')

    # 6. stop the instance (optional)
    job.stop()

Example #2

def main():
    start_time = time.time()
    # 1. Create infrastructure
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, \
        f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, " \
        f"but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            disable_nas=True,
                            spot=True,
                            instance_type=INSTANCE_TYPE)

    # 2. Upload perseus bert code.
    job.run('yum install -y unzip')
    job.upload('perseus-bert')
    job.run('conda activate tensorflow_1.14_cu10.0_py36')

    # 3. Download pretrain model and dataset.
    BERT_CHINESE_BASE_DIR = '/root/chinese_L-12_H-768_A-12'
    DATA_DIR = '/root/toutiao_data'
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/chinese_L-12_H-768_A-12.zip  && unzip chinese_L-12_H-768_A-12.zip'
    )
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/toutiao_data.tgz && tar xvf toutiao_data.tgz'
    )

    # 4. Run the training job.
    job.run('cd perseus-bert')
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root', f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}', f'--host {host_str}', '--bind-to none',
        '-x NCCL_DEBUG=INFO', '-x PATH', '-x PYTHONPATH', '-x LD_LIBRARY_PATH',
        '-x XLA_FLAGS'
    ]

    bert_classifier_cmd = [
        'python run_classifier.py', '--task_name=news', '--do_train=true',
        '--do_eval=true', f'--data_dir={DATA_DIR}',
        f'--vocab_file={BERT_CHINESE_BASE_DIR}/vocab.txt',
        f'--bert_config_file={BERT_CHINESE_BASE_DIR}/bert_config.json',
        f'--init_checkpoint={BERT_CHINESE_BASE_DIR}/bert_model.ckpt',
        '--max_seq_length=128', '--train_batch_size=48',
        '--learning_rate=8e-5', '--num_train_epochs=3.0',
        '--warmup_proportion=0.8', '--output_dir=/root/output_dir',
        '--use_amp=true', '--use_perseus=true', '--use_xla=true'
    ]

    cmd = mpi_cmd + bert_classifier_cmd
    cmd = " ".join(cmd)
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time: {elapsed_time} s.')

    job.stop()

Example #3

def main():
    start_time = time.time()
    # 1. Create infrastructure
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, \
        f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, " \
        f"but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        #image_name='aiacc-dlimg-centos7:1.3.0.post3',
        num_tasks=args.machines,
        instance_type=INSTANCE_TYPE,
        spot=True,
        disable_nas=True,
    )
    # 2. Upload perseus faster-rcnn code.
    job.upload('gluon-cv')
    job.run('conda activate mxnet_1.5.1.post0_cu10.0_py36')

    # 3. Download pretrain model and dataset.
    job.run('mkdir /root/mscoco')
    job.run(
        'cd /root/mscoco && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/annotations/annotations_trainval2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/train2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/test2017.zip'
    )
    job.run(
        'wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/val2017.zip'
    )

    job.run('mkdir -p /root/.mxnet/models')
    job.run(
        'cd /root/.mxnet/models && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/pretrain_model/resnet50_v1b-0ecdba34.params'
    )

    # 4. Install requirements.
    job.run('cd /root/gluon-cv/')
    job.run('pip install -r requirements.txt')

    job.run('python mscoco.py')

    # 5. Run the training job.
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root',
        f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x LD_LIBRARY_PATH',
    ]

    train_cmd = './train-perseus.sh'

    cmd = " ".join(mpi_cmd) + " " + train_cmd
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time: {elapsed_time} s.')

Example #4

#!/usr/bin/env python

import argparse
import ncluster
import os
import time

from ncluster import ncluster_globals
ncluster_globals.set_should_disable_nas(True)

INSTANCE_TYPE = 'ecs.gn6v-c10g1.20xlarge'  # V100
#INSTANCE_TYPE = 'ecs.gn6v-c8g1.16xlarge'
#INSTANCE_TYPE = 'ecs.gn5-c8g1.14xlarge'
NUM_GPUS = 8

ncluster.set_backend('aliyun')
parser = argparse.ArgumentParser()
parser.add_argument(
    '--name',
    type=str,
    default='perseus-faster-rcnn',
    help=
    "name of the current run, used for machine naming and tensorboard visualization"
)
parser.add_argument('--machines',
                    type=int,
                    default=1,
                    help="how many machines to use")
args = parser.parse_args()
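
This last snippet stops at argument parsing: in the complete scripts, a main() like the ones in the examples above and a standard entry point would follow. A minimal sketch of that completion is shown below; note that IMAGE_NAME is referenced by the region assertions in the earlier examples but never defined in these excerpts, so the value used here is only an illustrative placeholder (it reuses the image name that appears commented out in Example #3).

# Illustrative completion (assumptions): IMAGE_NAME is a placeholder, not a value
# taken from the original scripts; replace it with the AMI actually published for
# your region. main() refers to one of the definitions shown in the examples above.
IMAGE_NAME = 'aiacc-dlimg-centos7:1.3.0.post3'

if __name__ == '__main__':
    main()

The script could then be launched with, for example, python train_faster_rcnn.py --machines 2 (the filename is illustrative; --name and --machines map to the argparse options defined above).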