def read_raw_parallel_lines(src_path: Union[str, Path],
                            tgt_path: Union[str, Path]) -> Iterator[RawRecord]:
    """
    Stream aligned (source, target) line pairs from two parallel text files.

    Both lines are stripped of surrounding whitespace; a pair is dropped when
    either side is empty after stripping.

    :param src_path: path of the source-side file
    :param tgt_path: path of the target-side file
    :return: iterator over ``(src, tgt)`` tuples of stripped strings
    :raises AttributeError: when the files have an unequal number of lines;
        ``zip_longest`` pads the shorter side with ``None`` and ``.strip()``
        fails on it
    """
    with IO.reader(src_path) as src_lines, IO.reader(tgt_path) as tgt_lines:
        for raw_src, raw_tgt in zip_longest(src_lines, tgt_lines):
            # an exception here means the two files are not the same length
            pair = (raw_src.strip(), raw_tgt.strip())
            if pair[0] and pair[1]:
                yield pair
def __init__(self, models: List[Path], exp: Union[Path, TranslationExperiment],
             lr: float = 1e-4, smoothing=0.1):
    """
    Set up the model combination, its optimizer and its training criterion.

    If ``combo-weights.yml`` exists in the experiment's work dir, the stored
    weights are reused as the initial weights, in the order of ``models``.

    :param models: paths of the models to combine
    :param exp: an experiment, or a path from which one is constructed
    :param lr: learning rate given to the Adam optimizer
    :param smoothing: label smoothing value given to the criterion
    """
    if isinstance(exp, Path):
        exp = TranslationExperiment(exp)
    self.w_file = exp.work_dir / 'combo-weights.yml'

    restored = None
    if self.w_file.exists():
        # NOTE(review): yaml.load is called without an explicit loader; confirm
        # which yaml implementation is bound here and that the file is trusted.
        with IO.reader(self.w_file) as rdr:
            combo_spec = yaml.load(rdr)
        weights = combo_spec['weights']
        # same models as before: no messing allowed
        assert len(weights) == len(models)
        restored = []
        model_path_strs = [str(m) for m in models]
        for m in model_path_strs:
            assert m in weights, f'{m} not found in weights file.'
        for m in model_path_strs:
            restored.append(weights[m])
        log.info(f"restoring previously stored weights {restored}")

    from rtg.module.decoder import load_models
    loaded = load_models(models, exp)
    combo = Combo(loaded, model_paths=models, w=restored)
    self.combo = combo.to(device)
    self.exp = exp
    self.optim = torch.optim.Adam(combo.parameters(), lr=lr)
    self.criterion = LabelSmoothing(
        vocab_size=combo.vocab_size,
        padding_idx=PAD_TOK_IDX,
        smoothing=smoothing,
    )
def read_raw_mono_recs(path: Union[str, Path], truncate: bool, max_len: int, tokenizer):
    """
    Stream tokenized records from a monolingual text file.

    Lines that are empty after stripping are skipped before tokenization.

    :param path: file to read, one record per line
    :param truncate: when true, every record is cut to ``max_len`` items;
        otherwise records that are empty or longer than ``max_len`` are dropped
    :param max_len: maximum record length
    :param tokenizer: callable applied to each stripped line
    :return: generator over tokenized records
    """
    with IO.reader(path) as lines:
        for line in lines:
            text = line.strip()
            if not text:
                continue
            rec = tokenizer(text)
            if truncate:
                yield rec[:max_len]
            elif 0 < len(rec) <= max_len:
                # Filter out longer (and empty) sentences
                yield rec
def _read_vocab(path: Path) -> List[str]:
    """
    Read a vocabulary file, keeping the first whitespace-separated field of each line.

    When ``do_clean`` (taken from the enclosing scope) is truthy, a leading
    '▁' character is removed from each word.

    :param path: vocabulary file, one entry per line
    :return: list of words in file order
    :raises IndexError: if a line has no fields (blank or whitespace only)
    """
    with IO.reader(path) as rdr:
        first_fields = [line.strip().split()[0] for line in rdr]
    if not do_clean:
        return first_fields
    # sentence piece starts with '▁' character
    return [w[1:] if w.startswith('▁') else w for w in first_fields]
def shell_pipe(cls, cmd_line, inp, out):
    """
    Run a shell command with ``inp`` as its standard input and ``out`` as its
    standard output, and wait for it to finish.

    :param cmd_line: shell commandlines
    :param inp: input file, to read records
    :param out: output file to store records
    :return: None
    """
    # the original message lacked the f-prefix and logged the literal "{cmd_line}"
    log.info(f"Shell cmd:: {cmd_line}")
    with IO.reader(inp) as rdr, IO.writer(out) as wtr:
        # NOTE(review): shell=True executes cmd_line through the shell; it must
        # only ever receive trusted command strings.
        proc = subprocess.Popen(cmd_line, stdin=rdr, stdout=wtr, shell=True)
        # wait inside the `with` so both files stay open while the process runs
        exit_code = proc.wait()
    if exit_code != 0:
        # previously the exit status was dropped silently; report it, do not raise
        log.warning(f"Shell cmd:: exited with non-zero status {exit_code}")
    log.info("Shell cmd:: Done")
def read_all(self) -> Iterator[IdExample]:
    """
    Stream examples from the tab-separated file at ``self.path``.

    The first column is parsed as the source side ``x``; the second column,
    when present, as the target side ``y`` (otherwise ``y`` is None).
    With ``self.truncate`` set, long sides are cut to ``self.max_src_len`` /
    ``self.max_tgt_len``; otherwise over-long records are skipped.
    Records with an empty side are skipped with a warning.

    :return: iterator of ``IdExample(x, y, id=<0-based line index>)``
    """
    with IO.reader(self.path) as lines:
        for idx, line in enumerate(lines):
            rec = line.split('\t')
            x = self._parse(rec[0].strip())
            y = self._parse(rec[1].strip()) if len(rec) > 1 else None
            if self.truncate:  # truncate long recs
                x = x[:self.max_src_len]
                if y is not None:
                    y = y[:self.max_tgt_len]
            elif len(x) > self.max_src_len or (0 if y is None else len(y)) > self.max_tgt_len:
                continue  # skip long recs
            if not x or (y is not None and len(y) == 0):  # empty on one side
                # y may be None here (single-column record with an empty source);
                # calling len(y) unconditionally raised TypeError instead of skipping.
                y_len = None if y is None else len(y)
                log.warning(f"Ignoring an empty record x:{len(x)} y:{y_len}")
                continue
            yield IdExample(x, y, id=idx)
def validate_args(args, exp: Experiment):
    """
    Check that the experiment is ready and resolve system-combination settings.

    ``args`` is modified in place: 'skip_check' is removed; 'sys_comb' is set
    to the default weights file when it is unset and that file exists; and,
    when 'sys_comb' is set, 'model_path' and 'weights' are filled from the
    'weights' mapping stored in that file.

    :param args: mutable mapping of arguments; must contain 'skip_check'
    :param exp: experiment to validate
    :raises AssertionError: if the experiment is not prepared/trained (unless
        skipped), a listed model path does not exist, or weights do not sum to 1
    """
    skip_check = args.pop('skip_check')
    if not skip_check:  # if --skip-check is not requested
        assert exp.has_prepared(), \
            f'Experiment dir {exp.work_dir} is not ready to train. Please run "prep" sub task'
        assert exp.has_trained(), \
            f'Experiment dir {exp.work_dir} is not ready to decode.' \
            f' Please run "train" sub task or --skip-check to ignore this'

    default_weights = exp.work_dir / 'combo-weights.yml'
    if not args.get('sys_comb') and default_weights.exists():
        log.warning("Found default combo weights, switching to combo mode")
        args['sys_comb'] = default_weights

    if not args.get("sys_comb"):
        return

    # NOTE(review): yaml.load is called without an explicit loader; confirm
    # which yaml implementation is bound here and that the file is trusted.
    with IO.reader(args['sys_comb']) as fh:
        weights = yaml.load(fh)['weights']
    model_paths, model_weights = zip(*weights.items())
    args['model_path'], args['weights'] = model_paths, model_weights
    for model in args['model_path']:
        assert Path(model).exists(), model
    assert abs(sum(args['weights']) - 1) < 1e-3, \
        f'Weights from --sys-comb file should sum to 1.0, given={args["weights"]}'
def __init__(self, path: Union[str, Path]):
    """
    Build the vocabulary from a YAML spec file.

    The spec provides 'model_id', which is passed to ``self.load_hub_model``,
    and 'mapping', which maps each token to a ``(new_idx, old_idx)`` pair.

    :param path: path to the YAML spec
    :raises AssertionError: if sorting tokens by new index changes their count,
        or a reserved token is not at its reserved index
    """
    # NOTE(review): yaml.load is called without an explicit loader; confirm
    # which yaml implementation is bound here and that the file is trusted.
    with IO.reader(path) as rdr:
        data = yaml.load(rdr)
    hub_api = self.load_hub_model(data['model_id'])
    # these are for XML-R wiz RoBERTa from fairseq ; generalize it for other models later
    self.bpe = hub_api.bpe

    # one pass over the mapping collects both lookup tables
    tok_to_new = {}
    new_to_old = {}
    for tok, (new_idx, old_idx) in data['mapping'].items():
        tok_to_new[tok] = new_idx
        new_to_old[new_idx] = old_idx

    self.tok2idx = tok_to_new
    # tokens ordered by ascending new index
    self.idx2tok = sorted(tok_to_new, key=tok_to_new.get)
    assert len(self.idx2tok) == len(self.tok2idx)
    for tok, idx in self.reserved():  # reserved are reserved
        assert self.tok2idx[tok] == idx
        assert self.idx2tok[idx] == tok
    self.new_idx2old_idx = new_to_old
def read_tsv(path: str):
    """
    Stream the rows of a tab-separated file.

    Lines are split on tab only and are not stripped, so the last field of
    a row keeps its line terminator.

    :param path: path of an existing file
    :return: generator over lists of column strings
    :raises AssertionError: if ``path`` does not exist (raised when the
        generator is first advanced)
    """
    assert os.path.exists(path)
    with IO.reader(path) as f:
        for line in f:
            yield line.split('\t')
def __init__(self, exp: Experiment, model: Optional[NMTModel] = None,
             model_factory: Optional[Callable] = None, optim: str = 'ADAM',
             **optim_args):
    """
    Prepare a trainer: model, optimizer, summary writer and optional samples.

    :param exp: experiment that supplies model args, checkpoints and samples
    :param model: a ready model; when falsy, ``model_factory`` builds one
        from ``exp.model_args``
    :param model_factory: callable ``(exp=..., **args) -> (model, args)``
    :param optim: key into ``Optims`` selecting the inner optimizer
    :param optim_args: optimizer arguments; 'warmup_steps', 'label_smoothing'
        and 'constant' are taken out before the inner optimizer is created
    """
    self.start_step = 0
    self.last_step = -1
    self.exp = exp
    optim_state = None

    if model:
        self.model = model
    else:
        model_args = exp.model_args
        assert model_args
        assert model_factory
        self.model, model_args = model_factory(exp=exp, **model_args)
        exp.model_args = model_args

    last_model, self.last_step = self.exp.get_last_saved_model()
    if not last_model:
        log.info("No earlier check point found. Looks like this is a fresh start")
    else:
        self.start_step = self.last_step + 1
        log.info(f"Resuming training from step:{self.start_step}, model={last_model}")
        state = torch.load(last_model)
        # a checkpoint is either a dict holding 'model_state' or the model state itself
        if 'model_state' in state:
            model_state = state['model_state']
        else:
            model_state = state
        if 'optim_state' in state:
            optim_state = state['optim_state']
        self.model.load_state_dict(model_state)

    # making optimizer: fill in defaults for the missing fields
    optim_args.setdefault('lr', 0.1)
    optim_args.setdefault('betas', [0.9, 0.98])
    optim_args.setdefault('eps', 1e-9)
    # these three are not passed to the inner optimizer
    warmup_steps = optim_args.pop('warmup_steps', 8000)
    self._smoothing = optim_args.pop('label_smoothing', 0.1)
    constant = optim_args.pop('constant', 2)

    self.model = self.model.to(device)
    inner_opt = Optims[optim].new(self.model.parameters(), **optim_args)
    if optim_state:
        log.info("restoring optimizer state from checkpoint")
        try:
            inner_opt.load_state_dict(optim_state)
        except Exception:
            log.exception("Unable to restore optimizer, skipping it.")
    self.opt = NoamOpt(self.model.model_dim, constant, warmup_steps, inner_opt,
                       step=self.start_step)

    # put the popped fields back so the complete set is stored with the experiment
    optim_args.update(warmup_steps=warmup_steps,
                      label_smoothing=self._smoothing,
                      constant=constant)

    self.tbd = (NoOpSummaryWriter() if self.exp.read_only
                else SummaryWriter(log_dir=str(exp.work_dir / 'tensorboard')))
    self.exp.optim_args = optim, optim_args
    if not self.exp.read_only:
        self.exp.persist_state()

    self.samples = None
    if exp.samples_file.exists():
        with IO.reader(exp.samples_file) as f:
            self.samples = [line.strip().split('\t') for line in f]
        log.info(f"Found {len(self.samples)} sample records")
        if self.start_step == 0:
            for samp_num, sample in enumerate(self.samples):
                self.tbd.add_text(f"sample/{samp_num}", " || ".join(sample), 0)
        # NOTE(review): nesting reconstructed from whitespace-collapsed text; the
        # decoder is assumed to be built only when a samples file exists - confirm.
        from rtg.module.decoder import Decoder
        self.decoder = Decoder.new(self.exp, self.model)

    if self.start_step == 0:
        self.init_embeddings()
    self.model = self.model.to(device)
def load_conf(inp: Union[str, Path]):
    """
    Load a YAML configuration file.

    :param inp: path of the YAML file
    :return: whatever ``yaml.load`` produces for the file's content
    """
    # NOTE(review): yaml.load is called without an explicit loader; confirm
    # which yaml implementation is bound here and that the file is trusted.
    with IO.reader(inp) as fh:
        conf = yaml.load(fh)
    return conf
def __init__(self, exp: Experiment, model: Optional[NMTModel] = None,
             model_factory: Optional[Callable] = None, optim: str = 'ADAM',
             **optim_args):
    """
    Prepare a trainer: model, optimizer, summary writer, samples and criterion.

    :param exp: experiment that supplies model args, config, checkpoints and samples
    :param model: a ready model; when falsy, ``model_factory`` builds one
        from ``exp.model_args``
    :param model_factory: callable ``(exp=..., **args) -> (model, args)``
    :param optim: key into ``Optims`` selecting the inner optimizer
    :param optim_args: optimizer arguments; missing fields are filled from
        ``self.default_optim_args``
    """
    self.last_step = -1
    self.exp = exp
    optim_state = None
    if model:
        self.model = model
    else:
        args = exp.model_args
        assert args
        assert model_factory
        self.model, args = model_factory(exp=exp, **args)
        exp.model_args = args

    last_model, self.last_step = self.exp.get_last_saved_model()
    if last_model:
        log.info(f"Resuming training from step:{self.last_step}, model={last_model}")
        state = torch.load(last_model, map_location=device)
        # a checkpoint is either a dict holding 'model_state' or the model state itself
        model_state = state['model_state'] if 'model_state' in state else state
        if 'optim_state' in state:
            optim_state = state['optim_state']
        self.model.load_state_dict(model_state)
        if 'amp_state' in state and dtorch.fp16:
            log.info("Restoring AMP state")
            dtorch._scaler.load_state_dict(state['amp_state'])
    else:
        log.info("No earlier check point found. Looks like this is a fresh start")

    # optimizer : default args for missing fields
    for k, v in self.default_optim_args.items():
        optim_args[k] = optim_args.get(k, v)

    self.n_gpus = torch.cuda.device_count()
    self.device_ids = list(range(self.n_gpus))

    # only these fields are passed to the inner optimizer
    inner_opt_args = {k: optim_args[k]
                      for k in ['lr', 'betas', 'eps', 'weight_decay', 'amsgrad']}

    self.core_model = self.model.to(device)

    trainable_params = self.exp.config['optim'].get('trainable', {})
    if trainable_params:
        # Fixed: this read `drtorch.is_distributed`, a name used nowhere else in
        # this block (all other references are `dtorch`), so it raised NameError
        # whenever 'trainable' was configured.
        # NOTE(review): assumes `dtorch` exposes `is_distributed` - confirm.
        if dtorch.is_distributed:  # model is wrapped in DP or DistributedDP
            log.warning(f">> Using more than 1 GPU with 'trainable' params is NOT tested")
        trainable_params = self.core_model.get_trainable_params(
            include=trainable_params.get('include'),
            exclude=trainable_params.get('exclude'))
    else:
        trainable_params = self.model.parameters()

    inner_opt = Optims[optim].new(trainable_params, **inner_opt_args)
    # wrap only after the optimizer was created from the unwrapped model's parameters
    self.model = dtorch.maybe_distributed(self.core_model)

    if optim_state:
        log.info("restoring optimizer state from checkpoint")
        try:
            inner_opt.load_state_dict(optim_state)
        except Exception:
            # best effort: training continues with a fresh optimizer state
            log.exception("Unable to restore optimizer, skipping it.")

    # NOTE(review): `self.start_step` is not assigned in this method; presumably
    # it is defined elsewhere on the class (e.g. as a property) - confirm.
    self.opt = NoamOpt(self.core_model.model_dim, optim_args['constant'],
                       optim_args['warmup_steps'], inner_opt, step=self.start_step,
                       inv_sqrt=optim_args['inv_sqrt'])

    if self.exp.read_only:
        self.tbd = NoOpSummaryWriter()
    else:
        self.tbd = SummaryWriter(log_dir=str(exp.work_dir / 'tensorboard'))

    self.exp.optim_args = optim, optim_args
    if not self.exp.read_only:
        self.exp.persist_state()

    self.samples = None
    if exp.samples_file and exp.samples_file.exists():
        with IO.reader(exp.samples_file) as f:
            self.samples = [line.strip().split('\t') for line in f]
            log.info(f"Found {len(self.samples)} sample records")
            if self.start_step == 0:
                for samp_num, sample in enumerate(self.samples):
                    self.tbd.add_text(f"sample/{samp_num}", " || ".join(sample), 0)
        # NOTE(review): nesting reconstructed from whitespace-collapsed text; the
        # decoder is assumed to be built only when a samples file exists - confirm.
        from rtg.module.decoder import Decoder
        self.decoder = Decoder.new(self.exp, self.core_model)

    if self.start_step <= 1:
        self.maybe_init_model()

    self.criterion = self.create_criterion(optim_args['criterion'])