Example no. 1
def main(args):
    # get all paths
    print('chk1')
    GPUtil.showUtilization()
    # exit(0)
    model = GecBERTModel(vocab_path=args.vocab_path,
                         model_paths=args.model_path,
                         max_len=args.max_len, min_len=args.min_len,
                         iterations=args.iteration_count,
                         min_error_probability=args.min_error_probability,
                         min_probability=args.min_error_probability,
                         lowercase_tokens=args.lowercase_tokens,
                         model_name=args.transformer_model,
                         special_tokens_fix=args.special_tokens_fix,
                         log=False,
                         confidence=args.additional_confidence,
                         is_ensemble=args.is_ensemble,
                         weigths=args.weights,
                         prune_amount=args.prune_amount,
                         num_layers_to_keep=args.keep
                         )
    GPUtil.showUtilization()
    print('chk2')

    # print('model:', model)

    # exit(0)

    cnt_corrections = predict_for_file(args.input_file, args.output_file, model,
                                       batch_size=args.batch_size)
    # evaluate with m2 or ERRANT
    print(f"Produced overall corrections: {cnt_corrections}")
Example no. 2
def configure_tf_devices(visible_ids=None):
    # Do nothing if no visible GPU IDs
    if not visible_ids or visible_ids[0] == -1:
        return

    try:
        deviceIDs = GPUtil.getAvailable(order='load',
                                        limit=100,
                                        maxLoad=0.5,
                                        maxMemory=0.5,
                                        includeNan=False,
                                        excludeID=[],
                                        excludeUUID=[])
    except ValueError:
        cprint(NO_NVIDIA_GPUS, 'yellow')
        return

    deviceIDs = [id_ for id_ in deviceIDs if id_ in visible_ids]

    if not deviceIDs:
        cprint(
            "Error: Currently, no GPU is eligible (available memory and load at <=50%)",
            "red")
        GPUtil.showUtilization()
        return

    cprint(
        "GPUs with utilization and memory load <50%: {}".format(', '.join(
            [str(x) for x in deviceIDs])), "green")

    return deviceIDs
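
configure_tf_devices above only returns the list of eligible device ids; applying them is left to the caller. A minimal usage sketch (an assumption, not part of the original code) masks the other GPUs before TensorFlow initializes:

import os

ids = configure_tf_devices(visible_ids=[0, 1])
if ids:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in ids)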
Example no. 3
 def on_epoch_end(self, epoch, logs={}):
     x, y = self.test_data
     loss, acc = self.model.evaluate(x, y, batch_size=2, verbose=0)
     self.test_loss.append(loss)
     self.test_acc.append(acc)
     GPUtil.showUtilization()
     print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))
 def on_train_batch_begin(self, batch, logs=None):
     if (self.record == True):
         if (batch == 2 or batch == 20):
             with open('logs/BRNN_GPU_Utils.txt', 'a') as f:
                 with contextlib.redirect_stdout(f):
                     print('Batch {} Begin.'.format(batch))
                     GPUtil.showUtilization()
 def on_epoch_begin(self, epoch, logs=None):
     if (epoch == 5):
         self.record = True
         with open('logs/BRNN_GPU_Utils.txt', 'a') as f:
             with contextlib.redirect_stdout(f):
                 print('Epoch {} Begin.'.format(epoch))
                 GPUtil.showUtilization()
Example no. 6
File: lz.py Project: luzai/opt18
def get_dev(n=1, ok=range(8), mem=(0.1, 0.45), sleep=20):
    import GPUtil, time, logging

    def _limit(devs, ok):
        devs = [int(dev) for dev in devs if int(dev) in ok]
        return devs

    def get_dev_one(mem):
        devs = GPUtil.getAvailable(order='memory', maxLoad=1, maxMemory=mem, limit=n)
        devs = _limit(devs, ok)
        if len(devs) >= n:
            logging.info('available {}'.format(devs))
            return devs
        else:
            return []

    logging.info('Auto select gpu')
    GPUtil.showUtilization()
    devs = []

    while len(devs) < n:
        devs = get_dev_one(mem[0])
        if devs: return devs
        devs = get_dev_one(mem[1])
        if devs: return devs

        print('not enough devices available')
        GPUtil.showUtilization()
        time.sleep(sleep)
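
A hedged usage sketch of get_dev above (the surrounding script is assumed, not shown in the original): block until a GPU below the memory thresholds is free, then pin the process to it before any CUDA work starts.

import os

devs = get_dev(n=1, ok=range(8), mem=(0.1, 0.45), sleep=20)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(d) for d in devs)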
Example no. 7
 def forward(self, x):
   print("Net10a GPU")
   GPUtil.showUtilization()
   print("Net10a pre assigned x: " + str(x))
   x = self.features(x)  # do not flatten
   print("Net10a post assigned x: " + str(x))
   return x
Example no. 8
  def _make_layers(self, batch_norm=True):
    layers = []
    in_channels = self.in_channels
    for tup in self.cfg:
      assert (len(tup) == 2)
      print("adding cluster layer")
      GPUtil.showUtilization()

      out, dilation = tup
      sz = self.conv_size
      stride = 1
      pad = self.pad  # to avoid shrinking

      if out == 'M':
        layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
      elif out == 'A':
        layers += [nn.AvgPool2d(kernel_size=2, stride=2)]
      else:
        conv2d = nn.Conv2d(in_channels, out, kernel_size=sz,
                           stride=stride, padding=pad,
                           dilation=dilation, bias=False)
        if batch_norm:
          layers += [conv2d, nn.BatchNorm2d(out,
                                            track_running_stats=self.batchnorm_track),
                     nn.ReLU(inplace=True)]
        else:
          layers += [conv2d, nn.ReLU(inplace=True)]
        in_channels = out

    return nn.Sequential(*layers)
Example no. 9
    def train(self):
        r"""Training function.
        """
        # setup
        self.model.train()
        self.monitor.reset()
        self.optimizer.zero_grad()

        for iteration in range(self.total_iter_nums):
            iter_total = self.start_iter + iteration
            start = time.perf_counter()
            # load data
            batch = next(self.dataloader)
            volume, target = batch
            time1 = time.perf_counter()

            target_vis = target                        
            volume = volume.to(self.device, dtype=torch.float)
            volume = volume.unsqueeze(1)

            target = target[0].to(self.device, dtype=torch.long)
            target = target.squeeze(axis=1)                

            pred = self.model(volume)

            pred_vis = pred.argmax(1)
            pred_vis = pred_vis.unsqueeze(0).to(self.device, dtype=torch.float)
    
            loss = self.criterion(pred, target)

            # compute gradient
            loss.backward()
            if (iteration+1) % self.cfg.SOLVER.ITERATION_STEP == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

            # logging and update record
            do_vis = self.monitor.update(self.lr_scheduler, iter_total, loss, self.optimizer.param_groups[0]['lr']) 
        
            if do_vis:

                self.monitor.visualize(self.cfg, volume, target_vis, pred_vis, iter_total)
                # Display GPU stats using the GPUtil package.
                GPUtil.showUtilization(all=True)
            

            # Save model
            if (iter_total+1) % self.cfg.SOLVER.ITERATION_SAVE == 0:
                self.save_checkpoint(iter_total)

            # update learning rate
            self.lr_scheduler.step(loss) if self.cfg.SOLVER.LR_SCHEDULER_NAME == 'ReduceLROnPlateau' else self.lr_scheduler.step()

            end = time.perf_counter()
            print('[Iteration %05d] Data time: %.5f, Iter time:  %.5f' % (iter_total, time1 - start, end - start))

            # Release some GPU memory and ensure same GPU usage in the consecutive iterations according to 
            # https://discuss.pytorch.org/t/gpu-memory-consumption-increases-while-training/2770
            del loss, pred
Example no. 10
 def on_epoch_end(self, epoch, logs=None):
     if (self.record == True):
         self.record = False
         with open('logs/BRNN_GPU_Utils.txt', 'a') as f:
             with contextlib.redirect_stdout(f):
                 print('Epoch {} End.'.format(epoch))
                 GPUtil.showUtilization()
                 print('---------------')
Example no. 11
def check_gpu_usage():

    old_stdout = sys.stdout
    sys.stdout = mystdout = StringIO()
    GPUtil.showUtilization()
    sys.stdout = old_stdout
    gpu_usage = mystdout.getvalue().strip().split('|')[-2].strip()

    return gpu_usage
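
check_gpu_usage above recovers a value by capturing the table that GPUtil.showUtilization() prints to stdout and slicing a column out of its last row. A hedged alternative (the helper name is hypothetical) reads the same kind of information directly from the GPU objects, without parsing printed text:

def check_gpu_usage_structured():
    import GPUtil
    gpus = GPUtil.getGPUs()
    if not gpus:
        return 'N/A'
    # memoryUtil is a fraction in [0, 1]
    return '{:.0f}%'.format(gpus[0].memoryUtil * 100)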
Example no. 12
def main(args):

    pvutils.set_gpus_to_use(args)

    logging.info("Loading Config file: {}".format(args.config))
    config = json.load(args.config)

    logdir = pvorg.get_logdir_name(project=config['pyvision']['project_name'],
                                   bench=args.bench,
                                   cfg_file=args.config,
                                   prefix=args.name,
                                   timestamp=args.timestamp)

    pvorg.init_logdir(config, args.config, logdir)

    logging.info("Model initialized in: ")
    logging.info(logdir)

    if args.wait:
        import GPUtil
        while GPUtil.getGPUs()[0].memoryUtil > 0.1:
            logging.info("GPU 0 is beeing used.")
            GPUtil.showUtilization()
            sleep(60)

    if args.debug or args.train:

        sfile = config['pyvision']['entry_point']

        model_file = os.path.realpath(
            os.path.join(os.path.dirname(args.config), sfile))

        assert (os.path.exists(model_file))

        m = imp.load_source('model', model_file)

        mymodel = m.create_pyvision_model(config,
                                          logdir=logdir,
                                          debug=args.debug)

        if args.debug:
            restarts = 0
        else:
            restarts = args.restarts

        pvutils.robust_training(mymodel, restarts=restarts, subprocess=False)

        # Do forward pass
        # img_var = Variable(sample['image']).cuda() # NOQA
        # prediction = mymodel(img_var)
    else:
        logging.info("Initializing only mode. [Try train.py --train ]")
        logging.info("To start training run:")
        logging.info("    pv2 train {} --gpus".format(logdir))

    return logdir
Example no. 13
  def forward(self, x):
    results = []
    for i in range(self.num_sub_heads):
      print("GPU pre head forward")
      GPUtil.showUtilization()
      x_i = self.heads[i](x)
      x_i = F.interpolate(x_i, size=self.input_sz, mode="bilinear")
      results.append(x_i)

    return results
Example no. 14
    def parse(self):
        if not self.initialized:
            self.initialize()
        self.opt = self.parser.parse_args()

        # === processing options === begin ===
        # determine which GPU to use
        # auto, throw exception when no GPU is available
        if self.opt.gpu_ids == 'auto':
            GPUtil.showUtilization()
            deviceIDs = GPUtil.getAvailable(order='first', limit=4, maxLoad=0.5, maxMemory=0.5,
                                            excludeID=[], excludeUUID=[])
            deviceID_costs = [-1*x for x in deviceIDs]
            # reorder the deviceID according to the computational capacity, i.e., total memory size
            # memory size is divided by 1000 without remainder, to avoid small fluctuation
            gpus = GPUtil.getGPUs()
            memory_size_costs = [-1*(gpu.memoryTotal//1000) for gpu in gpus if (gpu.load < 0.5 and gpu.memoryUtil < 0.5)]
            names = [gpu.name for gpu in gpus if (gpu.load < 0.5 and gpu.memoryUtil < 0.5)]
            sorted_idx = np.lexsort((deviceID_costs, memory_size_costs))

            self.opt.gpu_ids = [deviceIDs[sorted_idx[0]]]
            print('### selected GPU PCI_ID: %d, Name: %s ###' % (self.opt.gpu_ids[0], names[sorted_idx[0]]))
        else:
            # split into integer list, manual or multi-gpu
            self.opt.gpu_ids = list(map(int, self.opt.gpu_ids.split(',')))

        self.opt.device = torch.device("cuda:%d" % self.opt.gpu_ids[0] if (torch.cuda.is_available() and len(self.opt.gpu_ids) >= 1) else "cpu")
        # cuda.select_device(self.opt.gpu_ids[0])
        # torch.cuda.set_device(self.opt.gpu_ids[0])

        # set unique display_id
        self.opt.display_id = int(self.opt.display_id + 100 * self.opt.gpu_ids[0])

        # assure that 2d & 3d rot are not conflicting
        assert ((self.opt.rot_3d & self.opt.rot_horizontal) == False)
        # === processing options === end ===

        args = vars(self.opt)

        print('------------ Options -------------')
        for k, v in sorted(args.items()):
            print('%s: %s' % (str(k), str(v)))
        print('-------------- End ----------------')

        # save to the disk
        expr_dir =  os.path.join(self.opt.checkpoints_dir, self.opt.name)
        util.mkdirs(expr_dir)
        file_name = os.path.join(expr_dir, 'opt.txt')
        with open(file_name, 'wt') as opt_file:
            opt_file.write('------------ Options -------------\n')
            for k, v in sorted(args.items()):
                opt_file.write('%s: %s\n' % (str(k), str(v)))
            opt_file.write('-------------- End ----------------\n')
        return self.opt
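
The np.lexsort call in the 'auto' branch above is easy to misread: the last key passed is the primary sort key. A small worked illustration with assumed (hypothetical) values shows the resulting order:

import numpy as np

deviceIDs = [0, 1, 2]                           # eligible GPUs, as returned by getAvailable
deviceID_costs = [-1 * x for x in deviceIDs]    # [0, -1, -2]
memory_size_costs = [-11, -8, -11]              # -(memoryTotal // 1000), e.g. two 11 GB cards and one 8 GB card

# Primary key: memory_size_costs (largest total memory first, since costs are negative);
# ties are broken by deviceID_costs (higher device id first).
sorted_idx = np.lexsort((deviceID_costs, memory_size_costs))
print(sorted_idx)                               # [2 0 1] -> deviceIDs[2] is selected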
Example no. 15
def pick_device():
    try:
        GPUtil.showUtilization()
        # Get the first available GPU
        DEVICE_ID_LIST = GPUtil.getFirstAvailable()
        DEVICE_ID = DEVICE_ID_LIST[0]  # grab first element from list
        # Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first available device id
        os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
        logging.debug('Device ID (unmasked): ' + str(DEVICE_ID))
    except:
        logging.exception('Cannot detect GPUs')
Example no. 16
    def train(self):

        optimizer = torch.optim.Adam(itertools.chain(self.encoder.parameters(),
                                                     self.out.parameters()),
                                     lr=self.config.learning_rate)
        criterion = torch.nn.CrossEntropyLoss()  # torch.nn.MSELoss()

        self.encoder.train()
        self.out.train()

        for e in range(1, self.config.epoch_size + 1):
            print(f'Start {e} epoch')
            for i, (content, target) in enumerate(self.train_loader):
                content = content.cuda()
                target = target.cuda()

                latent_feature = self.encoder(content)
                classification = self.out(latent_feature)

                loss = criterion(classification, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if i % self.config.log_interval == 0:
                    import GPUtil
                    GPUtil.showUtilization()
                    now = datetime.datetime.now()
                    otherStyleTime = now.strftime("%Y-%m-%d %H:%M:%S")
                    print(otherStyleTime)
                    print('epoch: ', e, ' iter: ', i)
                    print('loss:', loss.cpu().item())

                    self.encoder.eval()
                    self.out.eval()

                    pred = self.out(self.encoder(content))
                    pred = torch.argmax(pred, -1)
                    acc = torch.sum((pred == target).float()) / target.shape[0]
                    print('accuracy :', acc.item())

                    torch.save(
                        {
                            'encoder': self.encoder.state_dict(),
                            'out': self.out.state_dict()
                        }, f'{self.model_state_dir}/epoch_{e}-iter_{i}.pth')
                    self.encoder.train()
                    self.out.train()
Example no. 17
def get_gpu_info():
    """
    :return:
    """
    gpulist = []
    GPUtil.showUtilization()

    # Collect the info of every GPU into a list (Gpus is assumed to be
    # GPUtil.getGPUs() at module level)
    for gpu in Gpus:
        print('GPU.id:', gpu.id)
        print('GPU total memory:', gpu.memoryTotal)
        print('GPU memory used:', gpu.memoryUsed)
        print('GPU memory utilization (%):', gpu.memoryUtil * 100)
        # append the info of each GPU
        gpulist.append(
            [gpu.id, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryUtil * 100])
    """
    Return a list of available GPU ids based on load and memory usage.
    order='first': return the available GPU ids in ascending order
    limit: maximum number of GPU ids to return
    maxLoad: maximum allowed GPU load (GPUs above this value are not returned)
    maxMemory: maximum allowed GPU memory utilization (GPUs above this value are not returned)
    includeNan: whether to include GPUs whose load or memory usage is NaN
    excludeID: list of GPU ids to exclude
    excludeUUID: like excludeID, but with UUIDs instead of ids
    """
    GPUavailable = GPUtil.getAvailable(order='first',
                                       limit=1,
                                       maxLoad=0.5,
                                       maxMemory=0.5,
                                       includeNan=False,
                                       excludeID=[],
                                       excludeUUID=[])
    gpulist.append(GPUavailable)
    """
    根据GPU负载以及显存使用量返回第一个可用GPU_id,当无可用GPU时,将报错
    getAvailable参数均可用,含义一致
    attempts: 表示无法获取可用GPU时,尝试重复获取次数
    interval:  表示每次获取可用GPU时,时间间隔(秒)
    verbose:  表示在获取到最佳可用GPU时,是否打印尝试次数
    """
    GPUfirstavailable = GPUtil.getFirstAvailable(order='first',
                                                 attempts=1,
                                                 interval=900,
                                                 verbose=False)

    gpulist.append(GPUfirstavailable)
    return gpulist
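
As a hedged follow-up to the parameter notes above (the values are only illustrative), the id returned by getFirstAvailable can be used to pin the process to that GPU before any CUDA context is created:

import os
import GPUtil

device_id = GPUtil.getFirstAvailable(order='first', attempts=1, interval=900,
                                     verbose=False)[0]
os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)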
Example no. 18
 def _initialize_weights(self, mode='fan_in'):
   for m in self.modules():
     print("GPU pre module")
     GPUtil.showUtilization()
     if isinstance(m, nn.Conv2d):
       nn.init.kaiming_normal_(m.weight, mode=mode, nonlinearity='relu')
       if m.bias is not None:
         m.bias.data.zero_()
     elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
       assert (m.track_running_stats == self.batchnorm_track)
       m.weight.data.fill_(1)
       m.bias.data.zero_()
     elif isinstance(m, nn.Linear):
       m.weight.data.normal_(0, 0.01)
       m.bias.data.zero_()
Example no. 19
def test_render_rendering_cleaning():
    for i in range(5):
        renderer = MeshRenderer(width=800, height=600)
        renderer.load_object(
            os.path.join(dir, 'mesh/bed1a77d92d64f5cbbaaae4feed64ec1_new.obj'))
        renderer.add_instance(0)
        renderer.set_camera([0, 0, 1.2], [0, 1, 1.2], [0, 1, 0])
        renderer.set_fov(90)
        rgb, _, seg, _ = renderer.render()
        assert (np.allclose(np.mean(rgb, axis=(0, 1)),
                            np.array([0.51661223, 0.5035339, 0.4777793, 1.]),
                            rtol=1e-3))
        GPUtil.showUtilization()
        renderer.release()
        GPUtil.showUtilization()
Example no. 20
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                               num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example no. 21
def checkgpu():
    '''check gpu availability and utilization'''
    card = gpu.getGPUs()
    isavailable = gpu.getAvailability(card, maxLoad=.6)
    print(time.ctime())
    if isavailable == [1]:
        print("can mine")
        time.sleep(5)
        return 'isavailable'

    if isavailable == [0]:
        print("gpu in use")
        gpu.showUtilization()
        time.sleep(5)

        return 'notavailable'
Example no. 22
def get_gpu_info():
    '''
    :return:
    '''
    gpulist = []
    GPUtil.showUtilization()

    for gpu in Gpus:
        print('gpu.id:', gpu.id)
        print('Total GPU:', gpu.memoryTotal)
        print('GPU usage:', gpu.memoryUsed)
        print('gpu Util percentage:', gpu.memoryUtil * 100)
        gpulist.append(
            [gpu.id, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryUtil * 100])

    return gpulist
Example no. 23
 def get_gpu_memory_owned():
     try:
         # GPUtil.showUtilization() only prints a table and returns None, so
         # read the total memory from the first GPU object instead.
         gpu_usage = GPUtil.getGPUs()[0]
         COMPONENTS_INFO["gpu_memory_owned"][
             "message"] = f"GPU memory - Total: {'%.2f' % float(gpu_usage.memoryTotal / 1000)}GB"
     except (IndexError, UnboundLocalError):
         COMPONENTS_INFO["gpu_memory_owned"][
             "message"] = "GPU memory - Total: None"
Example no. 24
def get_gpu_info():
    '''
    :return:
    '''
    gpulist = []
    GPUtil.showUtilization()

    # Collect the info of every GPU into a list
    for gpu in Gpus:
        print('gpu.id:', gpu.id)
        print('GPU total memory:', gpu.memoryTotal)
        print('GPU memory used:', gpu.memoryUsed)
        print('GPU memory utilization (%):', gpu.memoryUtil * 100)
        # append the info of each GPU
        gpulist.append(
            [gpu.id, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryUtil * 100])

    return gpulist
Example no. 25
def test_render_rendering_cleaning():
    download_assets()
    test_dir = os.path.join(gibson2.assets_path, 'test')

    for i in range(5):
        renderer = MeshRenderer(width=800, height=600)
        renderer.load_object(
            os.path.join(test_dir,
                         'mesh/bed1a77d92d64f5cbbaaae4feed64ec1_new.obj'))
        renderer.add_instance(0)
        renderer.set_camera([0, 0, 1.2], [0, 1, 1.2], [0, 1, 0])
        renderer.set_fov(90)
        rgb = renderer.render(('rgb'))[0]
        assert (np.sum(rgb, axis=(0, 1, 2)) > 0)

        GPUtil.showUtilization()
        renderer.release()
        GPUtil.showUtilization()
Example no. 26
    def _train_misc(self, loss, pred, volume, target, weight, iter_total,
                    losses_vis):
        self.backward_pass(loss)  # backward pass

        # logging and update record
        if hasattr(self, 'monitor'):
            do_vis = self.monitor.update(iter_total, loss, losses_vis,
                                         self.optimizer.param_groups[0]['lr'])
            if do_vis:
                self.monitor.visualize(volume, target, pred, weight,
                                       iter_total)
                if torch.cuda.is_available():
                    GPUtil.showUtilization(all=True)

        # Save model
        if (iter_total + 1) % self.cfg.SOLVER.ITERATION_SAVE == 0:
            self.save_checkpoint(iter_total)

        if (iter_total + 1) % self.cfg.SOLVER.ITERATION_VAL == 0:
            self.validate(iter_total)

        # update learning rate
        self.maybe_update_swa_model(iter_total)
        self.scheduler_step(iter_total, loss)

        if self.is_main_process:
            self.iter_time = time.perf_counter() - self.start_time
            self.total_time += self.iter_time
            avg_iter_time = self.total_time / (iter_total + 1 -
                                               self.start_iter)
            est_time_left = avg_iter_time * \
                (self.total_iter_nums+self.start_iter-iter_total-1) / 3600.0
            info = [
                '[Iteration %05d]' % iter_total,
                'Data time: %.4fs,' % self.data_time,
                'Iter time: %.4fs,' % self.iter_time,
                'Avg iter time: %.4fs,' % avg_iter_time,
                'Time Left %.2fh.' % est_time_left
            ]
            print(' '.join(info))

        # Release some GPU memory and ensure same GPU usage in the consecutive iterations according to
        # https://discuss.pytorch.org/t/gpu-memory-consumption-increases-while-training/2770
        del volume, target, pred, weight, loss, losses_vis
Example no. 27
def determine(args):
    def get_or_else(val, default):
        if val is not None:
            return val
        return default

    def generate_random(n):
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))

    description_fuscated = ''.join(ch for ch in args.description
                                   if ch.isalnum()).lower()
    base_name = int((args.max_len_name * 3) / 4)
    rest = int(args.max_len_name - base_name)
    custom_name = (''.join(description_fuscated.split(' '))[0:base_name]
                   ) + "-" + generate_random(rest)
    model_name = get_or_else(args.model_name, custom_name)
    docker_image = get_or_else(args.docker_image, custom_name)
    description = args.description

    available_gpus = GPUtil.getAvailable(order='first',
                                         limit=1,
                                         maxLoad=0.2,
                                         maxMemory=0.2,
                                         includeNan=False)
    if len(available_gpus) == 0:
        print("Currently there is no gpu available, printing load")
        GPUtil.showUtilization()
        exit(EXIT_CODE)
    id_free_gpu = available_gpus[0]

    print('Determined starting arguments:')
    print(f'description = [{description}]')
    print(f'model_name = [{model_name}]')
    print(f'docker_image = [{docker_image}]')
    print(f'id_free_gpu = [{id_free_gpu}]')
    GPUtil.showUtilization()

    if not query_yes_no('Do you accept these?'):
        print("Right, start again [might want to --help then]")
        exit(EXIT_CODE)

    return (description_fuscated + '.' +
            model_name), model_name, docker_image, id_free_gpu
Example no. 28
 def get_gpu_memory_usage():
     while True:
         try:
             # GPUtil.showUtilization() only prints a table and returns None, so
             # read the memory figures from the first GPU object instead.
             gpu_usage = GPUtil.getGPUs()[0]
             COMPONENTS_INFO["gpu_memory_usage"][
                 "message"] = f"Using: {'%.2f' % float(gpu_usage.memoryUsed / 1000)}GB ({'%.1f' % (float(gpu_usage.memoryUsed / gpu_usage.memoryTotal) * 100)}%)"
         except (IndexError, UnboundLocalError):
             COMPONENTS_INFO["gpu_memory_usage"]["message"] = "Using: None"
             break
         time.sleep(config.SLEEP_IN_SEC)
Example no. 29
def run(clock):
    while True:
        print("Time: " + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
        p = psutil.Process(int(pid))
        pinfo = p.as_dict(ad_value=psutil.AccessDenied)

        print(bcolors.WARNING + "**Process CPU Info**" + bcolors.ENDC)
        print("Memory: " + str_ntuple(pinfo['memory_full_info'],
                                      bytes2human=True))
        print("Memory %: " + str(p.memory_percent()))
        print("cpu time: " + str_ntuple(pinfo['cpu_times']))
        print("cpu %: " + str(p.cpu_percent(interval=1)))
        print(bcolors.WARNING + "**Process GPU Info**" + bcolors.ENDC)
        print("gpu index: " + gpu_index(pid))
        print("process type: " + gpu_type(pid))
        print("gpu Memory: " + gpu_mem(pid))
        print(bcolors.WARNING + "**Global GPU Info**" + bcolors.ENDC)
        GPUtil.showUtilization()
        print("")
        time.sleep(clock)
Example no. 30
 def get_gpu_usage():
     while True:
         try:
             # GPUtil.showUtilization() only prints a table and returns None, so
             # read the load from the first GPU object instead.
             gpu_usage = GPUtil.getGPUs()[0]
             COMPONENTS_INFO["gpu_usage"][
                 "message"] = f"GPU - Using: {'%.1f' % float(gpu_usage.load * 100)}%"
         except (IndexError, UnboundLocalError):
             print(termcolor.colored("Can't find GPU\n", "red"))
             COMPONENTS_INFO["gpu_usage"]["message"] = "GPU - Using: None"
             break
         time.sleep(config.SLEEP_IN_SEC)
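
get_gpu_usage and the related monitors above (get_gpu_memory_usage, get_gpu_memory_owned) only update COMPONENTS_INFO, and the two polling variants never return while a GPU is present. A minimal sketch of how they might be launched (the threading setup is my assumption, not part of the original snippets):

import threading

for worker in (get_gpu_usage, get_gpu_memory_usage, get_gpu_memory_owned):
    threading.Thread(target=worker, daemon=True).start()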