def main(_):
  FLAGS.train_input = '../input/train.small'
  FLAGS.valid_input = '../input/train.small'
  FLAGS.batch_size = 4
  FLAGS.feat_file_path = '../input/feature_index'
  FLAGS.field_file_path = '../input/feat_fields.old'
  melt.init()

  dataset = Dataset('train')
  #dataset = Dataset('valid')
  iter = dataset.make_batch()
  op = iter.get_next()
  print('---batch_size', dataset.batch_size, FLAGS.batch_size)

  sess = melt.get_session()
  print('----sess', sess)

  if not FLAGS.use_horovod:
    for epoch in range(2):
      for i in range(3):
        batch = sess.run(op)
        print(epoch, i, len(batch[0]['id']), batch[0]['id'])
  else:
    for epoch in range(2):
      for i in range(3):
        batch = sess.run(op)
        print(epoch, i, len(batch[0]['id']), batch[0]['id'])
def main(_):
  FLAGS.torch_only = True
  #FLAGS.valid_input = None
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)()
  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()

  train_files = gezi.list_files(FLAGS.train_input)
  train_ds = get_dataset(train_files, td)

  ## pin_memory=True speeds things up a bit.
  ## num_workers=1 is very slow, especially for validation; 4 workers seem enough.
  ## Large worker counts are risky: 12 sometimes works, sometimes hangs, and uses a lot of resources.
  #kwargs = {'num_workers': 12, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #kwargs = {'num_workers': 6, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  kwargs = {
      'num_workers': 8,
      'pin_memory': True,
      'collate_fn': lele.DictPadCollate()
  }
  ## For 1 gpu, setting > 8 might make startup very slow.
  #num_workers = int(8 / hvd.size())
  # num_workers = 0
  # pin_memory = False
  #kwargs = {'num_workers': num_workers, 'pin_memory': pin_memory, 'collate_fn': lele.DictPadCollate()}

  train_dl = DataLoader(train_ds, FLAGS.batch_size, shuffle=True, **kwargs)
  #kwargs['num_workers'] = max(1, num_workers)

  #logging.info('num train examples', len(train_ds), len(train_dl))

  if FLAGS.valid_input:
    valid_files = gezi.list_files(FLAGS.valid_input)
    valid_ds = get_dataset(valid_files, td)
    valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, **kwargs)
    #kwargs['num_workers'] = max(1, num_workers)
    valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, **kwargs)
    #logging.info('num valid examples', len(valid_ds), len(valid_dl))

  fit(model,
      loss_fn,
      dataset=train_dl,
      valid_dataset=valid_dl,
      valid_dataset2=valid_dl2,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)
      write_valid=False,
     )
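# lele.DictPadCollate() is passed as collate_fn above, but its code is not part of
# this listing. The helper below is only a rough sketch of what such a dict-padding
# collate might do; the function name, the stacking/padding rules, and padding_value=0
# are assumptions for illustration, not the lele implementation.
import torch
from torch.nn.utils.rnn import pad_sequence

def dict_pad_collate(batch):
  """Collate a list of dict examples into a dict of batched tensors."""
  out = {}
  for key in batch[0]:
    values = [torch.as_tensor(example[key]) for example in batch]
    if values[0].dim() == 0 or len({v.shape for v in values}) == 1:
      # scalars and fixed-size fields can simply be stacked
      out[key] = torch.stack(values)
    else:
      # variable-length fields are padded to the longest sequence in the batch
      out[key] = pad_sequence(values, batch_first=True, padding_value=0)
  return out

# usage sketch: DataLoader(train_ds, FLAGS.batch_size, shuffle=True, collate_fn=dict_pad_collate)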
def main(_):
  FLAGS.torch_only = True
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)()
  model = model.cuda()
  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()
  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)

  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #kwargs = {'num_workers': 0, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  num_workers = 1
  kwargs = {'num_workers': num_workers, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

  train_sampler = train_ds
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_ds, num_replicas=hvd.size(), rank=hvd.rank())
  train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)

  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)

  kwargs['num_workers'] = 1
  # DistributedSampler supports shuffle=False from version 1.2
  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
  kwargs['num_workers'] = 1
  valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
  valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)

  fit(model,
      loss_fn,
      dataset=train_dl,
      valid_dataset=valid_dl,
      valid_dataset2=valid_dl2,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)
      write_valid=False,
     )
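# Note (assumption, not shown in this listing): with a DistributedSampler, PyTorch only
# reshuffles the data across epochs if sampler.set_epoch(epoch) is called before each
# epoch; the snippet above relies on fit() or the caller to do that. A self-contained
# sketch of the pattern, using synthetic data and an explicit world size of 1 so it
# runs standalone:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

ds = TensorDataset(torch.arange(16, dtype=torch.float32).unsqueeze(1))
sampler = DistributedSampler(ds, num_replicas=1, rank=0, shuffle=True)
dl = DataLoader(ds, batch_size=4, sampler=sampler)

for epoch in range(2):
  sampler.set_epoch(epoch)  # gives a different shuffle order each epoch
  for (x,) in dl:
    pass  # training step would go here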
def main(_):
  FLAGS.torch_only = True
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)()
  loss_fn = nn.BCEWithLogitsLoss()

  td = TextDataset()
  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)

  import multiprocessing
  #-- easily gets Killed if the worker count is too large
  num_threads = int(multiprocessing.cpu_count() * 0.3)
  logging.info('num_threads as multiprocessing.cpu_count', num_threads)
  num_threads = 12

  train_dl = DataLoader(train_ds, FLAGS.batch_size, shuffle=True,
                        num_workers=num_threads, collate_fn=lele.DictPadCollate())
  #logging.info('num train examples', len(train_ds), len(train_dl))

  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)
  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size,
                        collate_fn=lele.DictPadCollate(), num_workers=num_threads)
  valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size,
                         collate_fn=lele.DictPadCollate(), num_workers=num_threads)
  #logging.info('num valid examples', len(valid_ds), len(valid_dl))

  fit(model,
      loss_fn,
      dataset=train_dl,
      valid_dataset=valid_dl,
      valid_dataset2=valid_dl2,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)
      write_valid=False,
     )
def main(_):
  fit = mt.fit
  config.init()
  mt.init()

  strategy = mt.distributed.get_strategy()
  with strategy.scope():
    model = getattr(base, FLAGS.model)()
    loss_fn = model.get_loss()
    # If you use model.get_model, you currently cannot use model.get_loss; otherwise tf2 keras raises:
    # Inputs to eager execution function cannot be Keras symbolic tensors, but found
    # [<tf.Tensor 'Squeeze_6:0' shape=(None,) dtype=int64>, <tf.Tensor 'Squeeze_10:0' shape=(None,) dtype=int64>]
    if not FLAGS.custom_loss:
      loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # if tf.__version__ >= '2':
    #   # TODO FIXME: buggy in tf1 non-eager mode below, but eager mode is ok
    #   # TF2 may have a problem: eager input not ok...
    #   dataset = Dataset('train')
    #   dataset.make_batch()
    #   if not dataset.has_varlen_keys:
    #     inputs = dataset.get_inputs()
    #     model = model.get_model(inputs)

    if not FLAGS.lm_target:
      callbacks = []
      #if tf.__version__ >= '2':
      #  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir, profile_batch=(5, 20))
      #  callbacks = [tensorboard_callback]
      fit(model,
          loss_fn=loss_fn,
          Dataset=Dataset,
          eval_fn=ev.evaluate,
          eval_keys=['uid_', 'did_', 'impression_id', 'uid_in_train', 'did_in_train'],  # used in evaluate.py for x
          out_hook=out_hook,
          infer_write_fn=infer_write,
          valid_write_fn=valid_write,
          callbacks=callbacks,
         )
    else:
      fit(model,
          loss_fn=loss_fn,
          Dataset=Dataset,
         )
def main(_):
  fit = melt.fit
  config.init()
  melt.init()

  strategy = melt.distributed.get_strategy()
  with strategy.scope():
    model = getattr(base, FLAGS.model)()
    # loss_fn = model.get_loss()
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    if tf.__version__ >= '2':
      # TODO FIXME: buggy in tf1 non-eager mode below, but eager mode is ok
      dataset = Dataset('train')
      dataset.make_batch()
      if not dataset.has_varlen_keys:
        inputs = dataset.get_inputs()
        model = model.get_model(inputs)

    if not FLAGS.lm_target:
      callbacks = []
      # tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir, profile_batch=(5, 20))
      # callbacks = [tensorboard_callback]
      fit(model,
          loss_fn=loss_fn,
          Dataset=Dataset,
          eval_fn=ev.evaluate,
          eval_keys=['uid_', 'did_', 'impression_id', 'uid_in_train', 'did_in_train'],  # used in evaluate.py for x
          out_hook=out_hook,
          infer_write_fn=infer_write,
          valid_write_fn=valid_write,
          callbacks=callbacks,
         )
    else:
      fit(model,
          loss_fn=loss_fn,
          Dataset=Dataset,
         )
def main(_):
  FLAGS.torch_only = True
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)()
  loss_fn = nn.BCEWithLogitsLoss()

  td = TextDataset()
  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)
  train_dl = DataLoader(train_ds, FLAGS.batch_size, shuffle=True, num_workers=12)
  logging.info('num train examples', len(train_ds), len(train_dl))

  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)
  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size)
  valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size)
  logging.info('num valid examples', len(valid_ds), len(valid_dl))

  print(dir(valid_dl))

  fit(model,
      loss_fn,
      dataset=train_dl,
      valid_dataset=valid_dl,
      valid_dataset2=valid_dl2,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)
      write_valid=False,
     )
def main(_):
  FLAGS.torch = True
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier
  print('---------eval_batch_size', FLAGS.eval_batch_size)

  model_name = FLAGS.model
  model = getattr(base, model_name)()

  Dataset = TextDataset if 'tfrecord' not in FLAGS.train_input else TFRecordDataset

  loss_fn = nn.BCEWithLogitsLoss()

  fit(model,
      loss_fn,
      Dataset,
      eval_fn=ev.evaluate,
      valid_write_fn=ev.valid_write,
      #write_valid=FLAGS.write_valid)
      write_valid=False)
def main(_):
  FLAGS.torch_only = True
  melt.init()
  #fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier
  FLAGS.eval_batch_size = 512

  model_name = FLAGS.model
  model = getattr(base, model_name)()
  model = model.cuda()
  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()
  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)

  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #num_workers = int(16 / hvd.size())
  num_workers = 1  # with 1 worker it can take ~2 min to start; might just set 0 to be safe
  num_workers = 0  # setting 0 is much slower than 1, and startup alone takes over a minute
  # pin_memory makes little difference: a small speedup on a single gpu; with multiple gpus
  # it is mainly num_workers that drives resource usage, and too many may fail to start.
  # With multiple gpus, pin_memory=False is actually faster.
  #kwargs = {'num_workers': num_workers, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  kwargs = {'num_workers': 1, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

  train_sampler = train_ds
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_ds, num_replicas=hvd.size(), rank=hvd.rank())
  train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)

  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)

  # DistributedSampler supports shuffle=False from version 1.2
  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
  # valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
  #     valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
  #valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)

  optimizer = optim.Adamax(model.parameters(), lr=0.1)
  #optimizer = optim.SGD(model.parameters(), lr=0.1)

  hvd.broadcast_parameters(model.state_dict(), root_rank=0)
  hvd.broadcast_optimizer_state(optimizer, root_rank=0)
  optimizer = hvd.DistributedOptimizer(optimizer,
                                       named_parameters=model.named_parameters())

  for epoch in range(2):
    train(epoch, model, loss_fn, train_dl, optimizer)
    test(model, loss_fn, valid_dl)
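# The train()/test() helpers called in the loop above are defined elsewhere and are not
# part of this listing. Below is only a minimal sketch of what they might look like for
# this BCEWithLogitsLoss setup; the (x, y) batch layout, the .float() cast, and the
# omission of device transfer and Horovod metric averaging are all assumptions.
def train(epoch, model, loss_fn, loader, optimizer):
  model.train()
  for step, (x, y) in enumerate(loader):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y.float())
    loss.backward()
    optimizer.step()

def test(model, loss_fn, loader):
  model.eval()
  total, steps = 0., 0
  with torch.no_grad():
    for x, y in loader:
      total += loss_fn(model(x), y.float()).item()
      steps += 1
  return total / max(steps, 1)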
from __future__ import division
from __future__ import print_function

import sys
import os

import tensorflow as tf
flags = tf.app.flags
FLAGS = flags.FLAGS

from torch.utils.data import DataLoader

import gezi
import lele
import melt
melt.init()

#import horovod.tensorflow as hvd
# from pyt.dataset import *
# from text_dataset import Dataset as TD

import numpy as np

#import horovod.tensorflow as hvd
import horovod.torch as hvd
hvd.init()

for i in range(5):
  print(i)
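# A Horovod smoke test like this is normally launched with multiple worker processes,
# e.g. `horovodrun -np 2 python <script>.py` (assuming Horovod was built with the torch
# extension), so that hvd.size() and hvd.rank() reflect the actual worker layout and
# each rank prints its own copy of the loop output.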