Example #1
def main(_):
    FLAGS.torch_only = True
    melt.init()
    fit = melt.get_fit()

    FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

    model_name = FLAGS.model
    model = getattr(base, model_name)()

    loss_fn = nn.BCEWithLogitsLoss()

    td = TextDataset()
    train_files = gezi.list_files('../input/train/*')
    train_ds = get_dataset(train_files, td)

    import multiprocessing
    # the process is easily killed if num_workers is too large
    num_threads = int(multiprocessing.cpu_count() * 0.3)
    logging.info('num_threads from multiprocessing.cpu_count:', num_threads)

    num_threads = 12  # override with a fixed worker count
    train_dl = DataLoader(train_ds,
                          FLAGS.batch_size,
                          shuffle=True,
                          num_workers=num_threads,
                          collate_fn=lele.DictPadCollate())
    #logging.info('num train examples', len(train_ds), len(train_dl))
    valid_files = gezi.list_files('../input/valid/*')
    valid_ds = get_dataset(valid_files, td)
    valid_dl = DataLoader(valid_ds,
                          FLAGS.eval_batch_size,
                          collate_fn=lele.DictPadCollate(),
                          num_workers=num_threads)
    valid_dl2 = DataLoader(valid_ds,
                           FLAGS.batch_size,
                           collate_fn=lele.DictPadCollate(),
                           num_workers=num_threads)
    #logging.info('num valid examples', len(valid_ds), len(valid_dl))

    fit(
        model,
        loss_fn,
        dataset=train_dl,
        valid_dataset=valid_dl,
        valid_dataset2=valid_dl2,
        eval_fn=ev.evaluate,
        valid_write_fn=ev.valid_write,
        #write_valid=FLAGS.write_valid)
        write_valid=False,
    )
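The collate_fn passed above, lele.DictPadCollate, presumably pads the variable-length fields of each dict batch so they can be stacked. A minimal, self-contained sketch of that idea (a hypothetical dict_pad_collate plus a toy dataset, not the library's actual implementation) looks roughly like this:

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class ToyDictDataset(Dataset):
    """Yields (features_dict, label) pairs with a variable-length 'value' field."""
    def __init__(self, n=8):
        self.n = n
    def __len__(self):
        return self.n
    def __getitem__(self, i):
        x = {'id': torch.tensor(i), 'value': torch.arange(i % 4 + 1, dtype=torch.float)}
        y = torch.tensor(float(i % 2))
        return x, y

def dict_pad_collate(batch):
    """Stack scalar fields and pad variable-length fields to the batch max length."""
    xs, ys = zip(*batch)
    out = {}
    for key in xs[0]:
        vals = [x[key] for x in xs]
        if vals[0].dim() == 0:
            out[key] = torch.stack(vals)
        else:
            out[key] = pad_sequence(vals, batch_first=True)
    return out, torch.stack(ys)

dl = DataLoader(ToyDictDataset(), batch_size=4, collate_fn=dict_pad_collate)
for x, y in dl:
    print(x['value'].shape, y.shape)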
Example #2
def main(_):
    FLAGS.torch_only = True
    #FLAGS.valid_input = None
    melt.init()
    fit = melt.get_fit()

    FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

    model_name = FLAGS.model
    model = getattr(base, model_name)()

    loss_fn = nn.BCEWithLogitsLoss()

    td = text_dataset.Dataset()
    train_files = gezi.list_files(FLAGS.train_input)
    train_ds = get_dataset(train_files, td)

    ## pin_memory=True speeds things up a bit
    ## num_workers=1 is very slow, especially for validation; 4 workers seems enough.
    ## Larger values are risky: 12 sometimes works, sometimes hangs (too much resource usage).

    #kwargs = {'num_workers': 12, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
    #kwargs = {'num_workers': 6, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
    kwargs = {
        'num_workers': 8,
        'pin_memory': True,
        'collate_fn': lele.DictPadCollate()
    }
    ## for a single gpu, setting more than 8 workers can make startup very slow
    #num_workers = int(8 / hvd.size())
    # num_workers = 0
    # pin_memory = False
    #kwargs = {'num_workers': num_workers, 'pin_memory': pin_memory, 'collate_fn': lele.DictPadCollate()}

    train_dl = DataLoader(train_ds, FLAGS.batch_size, shuffle=True, **kwargs)

    #kwargs['num_workers'] = max(1, num_workers)
    #logging.info('num train examples', len(train_ds), len(train_dl))

    if FLAGS.valid_input:
        valid_files = gezi.list_files(FLAGS.valid_input)
        valid_ds = get_dataset(valid_files, td)
        valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, **kwargs)

        #kwargs['num_workers'] = max(1, num_workers)
        valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, **kwargs)
        #logging.info('num valid examples', len(valid_ds), len(valid_dl))

    fit(
        model,
        loss_fn,
        dataset=train_dl,
        valid_dataset=valid_dl,
        valid_dataset2=valid_dl2,
        eval_fn=ev.evaluate,
        valid_write_fn=ev.valid_write,
        #write_valid=FLAGS.write_valid)
        write_valid=False,
    )
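The commented-out kwargs above are manual tuning attempts for num_workers and pin_memory. A quick, generic way to compare settings on a given machine is to time one pass over a loader (a sketch using a synthetic TensorDataset, not the project's dataset):

import time
import torch
from torch.utils.data import DataLoader, TensorDataset

def time_loader(num_workers, pin_memory):
    ds = TensorDataset(torch.randn(20000, 128), torch.randint(0, 2, (20000,)))
    dl = DataLoader(ds, batch_size=512, num_workers=num_workers, pin_memory=pin_memory)
    start = time.time()
    for _ in dl:
        pass
    return time.time() - start

if __name__ == '__main__':
    for num_workers in (0, 4, 8):
        for pin_memory in (False, True):
            print(f'num_workers={num_workers} pin_memory={pin_memory} '
                  f'{time_loader(num_workers, pin_memory):.2f}s')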
Example #3
def main(_):
  FLAGS.torch_only = True
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)() 

  model = model.cuda()

  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()

  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)
  
  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #kwargs = {'num_workers': 0, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  
  num_workers = 1
  kwargs = {'num_workers': num_workers, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

  train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_ds, num_replicas=hvd.size(), rank=hvd.rank())
  
  train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)
  
  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)

  kwargs['num_workers'] = 1
  # DistributedSampler supports shuffle=False since PyTorch 1.2
  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

  valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
  
  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
  valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)

  fit(model,  
      loss_fn,
      dataset=train_dl,
      valid_dataset=valid_dl,
      valid_dataset2=valid_dl2,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)   
      write_valid=False,
     )
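The DistributedSampler calls above get num_replicas and rank from Horovod. As a standalone illustration of how the sampler partitions data, the sketch below passes explicit values instead of hvd.size()/hvd.rank(); with shuffle=False each rank sees a disjoint strided slice of the dataset:

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

ds = TensorDataset(torch.arange(10))
for rank in range(2):
    # explicit num_replicas/rank means no process group needs to be initialized
    sampler = DistributedSampler(ds, num_replicas=2, rank=rank, shuffle=False)
    dl = DataLoader(ds, batch_size=2, sampler=sampler)
    print('rank', rank, [batch[0].tolist() for batch in dl])
    # rank 0 -> [[0, 2], [4, 6], [8]]   rank 1 -> [[1, 3], [5, 7], [9]]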
Example #4
def main(_):
  FLAGS.torch_only = True
  
  melt.init()
  #fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier
  FLAGS.eval_batch_size = 512  # override with a fixed eval batch size

  model_name = FLAGS.model
  model = getattr(base, model_name)() 

  model = model.cuda()

  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()

  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)
  
  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #num_workers = int(16 / hvd.size())
  num_workers = 1  # with 1 worker startup takes about 2 min; could just use 0 to be safe
  num_workers = 0  # setting 0 is much slower than 1, and startup still takes over a minute
  # pin_memory has little effect: only a slight speedup on a single gpu. With multiple gpus,
  # num_workers is what matters for resource usage (too many and the job may fail to start).
  # With multiple gpus, pin_memory=False is actually faster.
  #kwargs = {'num_workers': num_workers, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  kwargs = {'num_workers': 1, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

  train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_ds, num_replicas=hvd.size(), rank=hvd.rank())
  
  train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)
  
  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)

  # DistributedSampler supports shuffle=False since PyTorch 1.2
  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

  # valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
  #     valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
  
  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
  
  #valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)


  optimizer = optim.Adamax(model.parameters(), lr=0.1)
  #optimizer = optim.SGD(model.parameters(), lr=0.1)
  hvd.broadcast_parameters(model.state_dict(), root_rank=0)
  hvd.broadcast_optimizer_state(optimizer, root_rank=0)

  optimizer = hvd.DistributedOptimizer(optimizer,
                                       named_parameters=model.named_parameters())

  for epoch in range(2):
    train(epoch, model, loss_fn, train_dl, optimizer)
    test(model, loss_fn, valid_dl)
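The train and test helpers called in the loop above are defined elsewhere in the project; a rough sketch of what such a pair could look like for a dict-input model trained with BCEWithLogitsLoss (hypothetical helpers, not the original code):

import torch

def train(epoch, model, loss_fn, train_dl, optimizer, device='cuda'):
    model.train()
    for step, (x, y) in enumerate(train_dl):
        x = {k: v.to(device) for k, v in x.items()}
        y = y.to(device).float()
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits.view(-1), y.view(-1))
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print(f'epoch {epoch} step {step} loss {loss.item():.4f}')

def test(model, loss_fn, valid_dl, device='cuda'):
    model.eval()
    total, n = 0.0, 0
    with torch.no_grad():
        for x, y in valid_dl:
            x = {k: v.to(device) for k, v in x.items()}
            y = y.to(device).float()
            logits = model(x)
            total += loss_fn(logits.view(-1), y.view(-1)).item() * len(y)
            n += len(y)
    print(f'valid loss {total / max(n, 1):.4f}')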
Example #5
def to_torch(x, y=None):
    # convert features (a dict of arrays or a single array) and the optional label to tensors
    if not isinstance(x, dict):
        x = torch_(x)
    else:
        for key in x:
            x[key] = torch_(x[key])
    if y is None:
        return x
    else:
        return x, y


files = gezi.list_files('../input/train.small/*')
td = TD()
ds = get_dataset(files, td)
dl = DataLoader(ds, 2, collate_fn=lele.DictPadCollate())
print(len(ds), len(dl), len(dl.dataset))
for i, (x, y) in enumerate(dl):
    print(i, x['id'][0], x['value'][0])
    # #print('--------------', d)
    # print(x['index'].shape)
    # print(x['field'].shape)
    # print(x['value'].shape)
    # print(x['id'].shape)
    # print(y.shape)
    #print(x)
    # for key in x:
    #   print(key, type(x[key][0]), type(x[key]), x[key][0].dtype)

    #x, y = to_torch(x, y)
    # if i == 2:
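The torch_ helper used in this snippet is not shown; a plausible minimal version (an illustrative assumption, not the library's actual implementation) just converts numpy arrays and scalars to tensors:

import numpy as np
import torch

def torch_(v):
    """Convert a numpy array or python scalar to a tensor, passing tensors through."""
    if torch.is_tensor(v):
        return v
    if isinstance(v, np.ndarray):
        return torch.from_numpy(v)
    return torch.tensor(v)

# usage on a feature dict like the ones produced by the dataset above
features = {'id': np.array([1, 2]), 'value': np.array([0.5, 1.5], dtype=np.float32)}
features = {k: torch_(v) for k, v in features.items()}
print({k: v.dtype for k, v in features.items()})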
Example #6
#import tensorflow as tf 

sampler = torch.utils.data.distributed.DistributedSampler(
            ds, num_replicas=hvd.size(), rank=hvd.rank())

# sampler = torch.utils.data.RandomSampler(sampler)

#sampler = torch.utils.data.RandomSampler(ds)
# # shuffling did not seem to work here..
# sampler = torch.utils.data.distributed.DistributedSampler(
#             sampler, num_replicas=hvd.size(), rank=hvd.rank(),
#             shuffle=True)

#collate_fn = lele.DictPadCollate2()
collate_fn = lele.DictPadCollate()

dl = DataLoader(ds, 2, 
                collate_fn=collate_fn,
                sampler=sampler)

print(len(ds), len(dl), len(dl.dataset))

for epoch in range(2):
  if dl.sampler and hasattr(dl.sampler, 'set_epoch'):
    dl.sampler.set_epoch(epoch)
  for i, (x, y) in enumerate(dl):
    for j in range(len(y)):
      print('epoch', epoch, 'i', i, 'j', j, x['id'][j])

    # #print('--------------', d)
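
The set_epoch call above matters because DistributedSampler derives its shuffle order from the epoch number: with shuffle=True and no set_epoch, every epoch replays the same permutation. A standalone sketch (explicit num_replicas/rank instead of hvd):

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

ds = TensorDataset(torch.arange(8))
sampler = DistributedSampler(ds, num_replicas=2, rank=0, shuffle=True)

for epoch in range(2):
    sampler.set_epoch(epoch)  # comment this out and both epochs print the same order
    print('epoch', epoch, list(sampler))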