import os
import shutil

from ncc.utils.path_manager import PathManager
# ``file_io`` below refers to ncc's file I/O wrapper; import it from the ncc utilities.


def _cat_and_remove(tgt_filename, num_workers):
    """Concatenate per-worker shards (``tgt_filename`` + worker index) into ``tgt_filename`` and delete them."""
    with file_io.open(tgt_filename, 'w') as writer:
        for idx in range(num_workers):
            src_filename = tgt_filename + str(idx)
            with file_io.open(src_filename, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            PathManager.rm(src_filename)
def _concate(_tgt_filename, num_workers, tgt_filename):
    """Concatenate per-worker shards (``_tgt_filename`` + worker index) into ``tgt_filename`` and delete them."""
    src_filenames = [_tgt_filename + str(idx) for idx in range(num_workers)]
    with file_io.open(tgt_filename, 'w') as writer:
        for _src_fl in src_filenames:
            with file_io.open(_src_fl, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            PathManager.rm(_src_fl)
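# --- Hypothetical usage sketch (not part of the original source) ---
# Both helpers above merge per-worker output shards named "<target><worker_id>"
# back into a single file and delete the shards. The path and worker count
# below are illustrative only.
if __name__ == '__main__':
    num_workers = 4
    tgt_filename = '/tmp/flatten/python/train/code_tokens.jsonl'
    # workers 0..3 would have written '.../code_tokens.jsonl0' ... '.../code_tokens.jsonl3'
    _cat_and_remove(tgt_filename, num_workers)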
def merge_attr_files(flatten_dir, lang, mode, attrs):
    """Merge per-worker attribute shards into one file per attribute (emulates ``cat`` in the shell)."""

    def _merge_files(src_files, tgt_file):
        with file_io.open(tgt_file, 'w') as writer:
            for src_fl in src_files:
                with file_io.open(src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)

    def _get_file_idx(filename):
        # shard files are named "<idx>.jsonl"; sort them by their numeric index
        filename = os.path.split(filename)[-1]
        idx = int(filename[:filename.rfind('.jsonl')])
        return idx

    for attr in attrs:
        attr_files = PathManager.ls(os.path.join(flatten_dir, lang, mode, attr, '*.jsonl'))
        attr_files = sorted(attr_files, key=_get_file_idx)
        assert len(attr_files) > 0, RuntimeError('Attribute({}) files do not exist.'.format(attr))
        dest_file = os.path.join(flatten_dir, lang, '{}.{}'.format(mode, attr))
        _merge_files(attr_files, dest_file)
    PathManager.rm(os.path.join(flatten_dir, lang, mode))
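# --- Hypothetical usage sketch (not part of the original source) ---
# After multiprocessing workers have flattened a raw dataset into per-attribute,
# per-worker "*.jsonl" shards, merge_attr_files collects them into
# "<flatten_dir>/<lang>/<mode>.<attr>" files and removes the shard directory.
# The directory layout and attribute names below are illustrative only.
merge_attr_files(
    flatten_dir='/data/codesearchnet/flatten',
    lang='python',
    mode='train',
    attrs=['code', 'code_tokens', 'docstring', 'docstring_tokens'],
)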
def single_main(args, init_distributed=False):
    assert args['dataset']['max_tokens'] is not None or args['dataset']['max_sentences'] is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # 0. Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])
    set_seed.set_seed(args['common']['seed'])
    if init_distributed:
        args['distributed_training']['distributed_rank'] = distributed_utils.distributed_init(args)

    # Verify checkpoint directory
    if distributed_utils.is_master(args):
        save_dir = args['checkpoint']['save_dir']
        checkpoint_utils.verify_checkpoint_directory(save_dir)
        PathManager.rm(os.path.join(save_dir, '*.pt'))  # this code will remove pre-trained models

    # 1. Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # 2. Load valid dataset (we load training data below, based on the latest checkpoint)
    task.load_dataset(args['dataset']['valid_subset'], combine=False, epoch=1)

    # 3. Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    LOGGER.info(model)
    LOGGER.info('model {}, criterion {}'.format(args['model']['arch'], criterion.__class__.__name__))
    LOGGER.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # 4. Build trainer
    trainer = Trainer(args, task, model, criterion)
    LOGGER.info('training on {} GPUs'.format(args['distributed_training']['distributed_world_size']))
    LOGGER.info('max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args['dataset']['max_tokens'],
        args['dataset']['max_sentences'],
    ))

    # 5. Load the latest checkpoint if one is available and restore the corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer, combine=False)

    # 6. Train until the learning rate gets too small
    max_epoch = args['optimization']['max_epoch'] or math.inf
    max_update = args['optimization']['max_update'] or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args['dataset']['valid_subset'].split(',')
    while (
        lr > args['optimization']['min_lr']
        and epoch_itr.next_epoch_idx <= max_epoch
        and trainer.get_num_updates() < max_update
    ):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args['dataset']['disable_validation'] and epoch_itr.epoch % args['dataset']['validate_interval'] == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args['checkpoint']['save_interval'] == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # early stop
        if should_stop_early(args, valid_losses[0]):
            LOGGER.info('early stop since valid performance hasn\'t improved for last {} runs'.format(
                args['checkpoint']['patience']))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            combine=False,  # TODO to be checked
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in args['task']['data']),
        )
    train_meter.stop()
    LOGGER.info('done training in {:.1f} seconds'.format(train_meter.sum))
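# --- Hypothetical entry-point sketch (not part of the original source) ---
# single_main follows the fairseq-style training loop: it runs once per process,
# either directly (CPU / single GPU) or once per rank when launched with
# torch.multiprocessing for multi-GPU training. The dispatcher below is only a
# sketch; the project's actual launcher and distributed setup may differ.
import torch
import torch.multiprocessing as mp


def _distributed_worker(rank, args):
    # each spawned process trains on its own device/rank
    args['distributed_training']['device_id'] = rank
    single_main(args, init_distributed=True)


def main(args):
    world_size = args['distributed_training']['distributed_world_size']
    if torch.cuda.is_available() and world_size > 1:
        mp.spawn(_distributed_worker, args=(args,), nprocs=world_size)
    else:
        single_main(args)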
import os
import zipfile

import wget
from tree_sitter import Language

from ncc import (
    __TREE_SITTER_LIBS_DIR__,
    LOGGER,
)
from ncc.utils.path_manager import PathManager

# define your config
YOUR_LANGUAGE = 'csharp'
TREE_SITTER_LIB_URL = 'https://github.com/tree-sitter/tree-sitter-c-sharp/archive/master.zip'

os.makedirs(__TREE_SITTER_LIBS_DIR__, exist_ok=True)
so_file = os.path.join(__TREE_SITTER_LIBS_DIR__, f'{YOUR_LANGUAGE}.so')

# download
lib_filename = os.path.join(__TREE_SITTER_LIBS_DIR__, f'{YOUR_LANGUAGE}.zip')
if PathManager.exists(lib_filename):
    PathManager.rm(lib_filename)
LOGGER.info(f"Download TreeSitter-{YOUR_LANGUAGE}-Parser from {TREE_SITTER_LIB_URL}")
wget.download(TREE_SITTER_LIB_URL, lib_filename)

# decompress
decompress_dir = os.path.join(__TREE_SITTER_LIBS_DIR__, 'tmp')
with zipfile.ZipFile(lib_filename, 'r') as zip_file:
    zip_file.extractall(path=decompress_dir)
lib_dir = os.path.join(decompress_dir, os.listdir(decompress_dir)[0])

# build
LOGGER.info(f"Build {YOUR_LANGUAGE}.so, and save it at {__TREE_SITTER_LIBS_DIR__}")
Language.build_library(
    # your language parser file; we recommend building a *.so file for each language
    so_file,
    [lib_dir],
)
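# --- Hypothetical usage sketch (not part of the original source) ---
# Once the shared library has been built, it can be loaded with py-tree-sitter
# to parse source code. The snippet below is illustrative; note that the name
# passed to Language must match the grammar's exported symbol
# (tree_sitter_c_sharp for tree-sitter-c-sharp), hence 'c_sharp' rather than
# YOUR_LANGUAGE here.
from tree_sitter import Language, Parser

csharp_lang = Language(so_file, 'c_sharp')
parser = Parser()
parser.set_language(csharp_lang)
tree = parser.parse(b'class Foo { int Bar() { return 42; } }')
print(tree.root_node.sexp())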