Beispiel #1
0
 def read_raw_parallel_lines(src_path: Union[str, Path], tgt_path: Union[str, Path]) \
         -> Iterator[RawRecord]:
     """Yield whitespace-stripped (source, target) line pairs read in lockstep.

     Pairs in which either side is empty after stripping are dropped.

     :param src_path: path of the source-side file
     :param tgt_path: path of the target-side file
     :return: iterator over ``(src, tgt)`` string tuples
     :raises AttributeError: when the two files have an unequal number of lines
     """
     with IO.reader(src_path) as src_lines, IO.reader(tgt_path) as tgt_lines:
         for src_line, tgt_line in zip_longest(src_lines, tgt_lines):
             # zip_longest pads the shorter file with None, so .strip() fails
             # here when the files have an unequal number of lines
             src_text, tgt_text = src_line.strip(), tgt_line.strip()
             if src_text and tgt_text:
                 yield src_text, tgt_text
Beispiel #2
0
    def __init__(self,
                 models: List[Path],
                 exp: Union[Path, TranslationExperiment],
                 lr: float = 1e-4,
                 smoothing=0.1):
        """
        Build a model combination together with its optimizer and loss criterion.

        :param models: paths of the models to combine
        :param exp: experiment, or a path from which a TranslationExperiment is created
        :param lr: learning rate handed to the Adam optimizer
        :param smoothing: label smoothing value handed to the criterion
        """
        if isinstance(exp, Path):
            exp = TranslationExperiment(exp)
        self.w_file = exp.work_dir / 'combo-weights.yml'

        init_weights = None
        if self.w_file.exists():
            with IO.reader(self.w_file) as rdr:
                stored_spec = yaml.load(rdr)
            stored = stored_spec['weights']
            # same models as before: no messing allowed
            assert len(stored) == len(models)
            init_weights = []
            for key in map(str, models):
                assert key in stored, f'{key} not found in weights file.'
                init_weights.append(stored[key])
            log.info(f"restoring previously stored weights {init_weights}")

        from rtg.module.decoder import load_models
        member_models = load_models(models, exp)
        combo = Combo(member_models, model_paths=models, w=init_weights)
        self.combo = combo.to(device)
        self.exp = exp
        self.optim = torch.optim.Adam(combo.parameters(), lr=lr)
        self.criterion = LabelSmoothing(
            vocab_size=combo.vocab_size,
            padding_idx=PAD_TOK_IDX,
            smoothing=smoothing,
        )
Beispiel #3
0
 def read_raw_mono_recs(path: Union[str, Path], truncate: bool,
                        max_len: int, tokenizer):
     """Yield tokenized records, one per non-blank line of ``path``.

     :param path: file to read
     :param truncate: when true, every record is cut to its first ``max_len``
         items; otherwise records that are empty or longer than ``max_len``
         are skipped
     :param max_len: maximum record length
     :param tokenizer: callable applied to each stripped line
     """
     with IO.reader(path) as lines:
         for line in lines:
             text = line.strip()
             if not text:
                 continue  # blank line
             rec = tokenizer(text)
             if truncate:
                 yield rec[:max_len]
             elif 0 < len(rec) <= max_len:  # filter out longer sentences
                 yield rec
Beispiel #4
0
 def _read_vocab(path: Path) -> List[str]:
     """Return the first whitespace-separated token of every line in ``path``.

     When ``do_clean`` (a name defined outside this function) is truthy, a
     leading '▁' is removed from each token.

     :raises IndexError: if a line contains no token at all
     """
     with IO.reader(path) as rdr:
         first_columns = [line.strip().split()[0] for line in rdr]
     if not do_clean:
         return first_columns
     # sentence piece starts with '▁' character
     return [tok[1:] if tok.startswith('▁') else tok for tok in first_columns]
Beispiel #5
0
    def shell_pipe(cls, cmd_line, inp, out):
        """
        Run a shell command with ``inp`` as its stdin and ``out`` as its stdout.

        The call blocks until the command finishes. A non-zero exit status is
        logged as a warning; no exception is raised for it.

        :param cmd_line: shell command line; it is executed with ``shell=True``
        :param inp: input file, to read records
        :param out: output file to store records
        :return: None
        """
        # NOTE(review): cmd_line is interpreted by the shell; it must come from a trusted source
        log.info(f"Shell cmd:: {cmd_line}")
        with IO.reader(inp) as rdr, IO.writer(out) as wtr:
            proc = subprocess.Popen(cmd_line,
                                    stdin=rdr,
                                    stdout=wtr,
                                    shell=True)
            exit_code = proc.wait()
        if exit_code != 0:
            log.warning(f"Shell cmd:: exited with non-zero status {exit_code}")
        log.info("Shell cmd:: Done")
Beispiel #6
0
 def read_all(self) -> Iterator[IdExample]:
     """Yield one IdExample per usable line of the tab-separated file at ``self.path``.

     The first column is parsed as ``x``; the second column, when present, is
     parsed as ``y`` (otherwise ``y`` is None). With ``self.truncate`` set,
     over-long sides are cut to ``self.max_src_len`` / ``self.max_tgt_len``;
     without it, over-long records are skipped. Records with an empty side are
     skipped with a warning. The example id is the index of the line in the file.
     """
     with IO.reader(self.path) as lines:
         recs = (line.split('\t') for line in lines)
         for idx, rec in enumerate(recs):
             x = self._parse(rec[0].strip())
             y = self._parse(rec[1].strip()) if len(rec) > 1 else None
             if self.truncate:  # truncate long recs
                 x = x[:self.max_src_len]
                 if y is not None:
                     y = y[:self.max_tgt_len]
             elif len(x) > self.max_src_len or (0 if y is None else
                                                len(y)) > self.max_tgt_len:
                 continue  # skip long recs
             if not x or (y is not None
                          and len(y) == 0):  # empty on one side
                 # y may be None here (single-column line), so len(y) is not safe
                 y_size = 'None' if y is None else len(y)
                 log.warning(
                     f"Ignoring an empty record  x:{len(x)}    y:{y_size}")
                 continue
             yield IdExample(x, y, id=idx)
Beispiel #7
0
def validate_args(args, exp: Experiment):
    """Check that the experiment is ready and resolve system-combination settings.

    ``skip_check`` is removed from ``args``. Unless it was set, the experiment
    must be prepared and trained. If no ``sys_comb`` is given but the work dir
    has a ``combo-weights.yml``, that file is used. When a combo file is in
    effect, ``args['model_path']`` and ``args['weights']`` are filled from its
    ``weights`` mapping; every model path must exist and the weights must sum
    to 1.0 within a tolerance of 1e-3.

    :param args: mutable mapping of command line arguments; modified in place
    :param exp: experiment to validate
    """
    skip_check = args.pop('skip_check')
    if not skip_check:  # if --skip-check is not requested
        assert exp.has_prepared(), \
            f'Experiment dir {exp.work_dir} is not ready to train. Please run "prep" sub task'
        assert exp.has_trained(), \
            f'Experiment dir {exp.work_dir} is not ready to decode.' \
            f' Please run "train" sub task or --skip-check to ignore this'

    default_combo = exp.work_dir / 'combo-weights.yml'
    combo_file = args.get('sys_comb')
    if not combo_file and default_combo.exists():
        log.warning("Found default combo weights, switching to combo mode")
        args['sys_comb'] = default_combo
        combo_file = default_combo

    if not combo_file:
        return

    with IO.reader(combo_file) as fh:
        weights = yaml.load(fh)['weights']
        model_paths, model_weights = zip(*weights.items())
        args['model_path'] = model_paths
        args['weights'] = model_weights
        for model in model_paths:
            assert Path(model).exists(), model
        total = sum(model_weights)
        assert abs(total - 1) < 1e-3, \
            f'Weights from --sys-comb file should sum to 1.0, given={args["weights"]}'
Beispiel #8
0
    def __init__(self, path: Union[str, Path]):
        """
        Load a vocabulary mapping from the YAML file at ``path``.

        The file provides a ``model_id`` (passed to ``self.load_hub_model``) and a
        ``mapping`` of token -> (new index, old index).

        :param path: YAML file to read
        """
        with IO.reader(path) as rdr:
            data = yaml.load(rdr)
        hub_api = self.load_hub_model(data['model_id'])
        # these are for XML-R wiz RoBERTa from fairseq  ; generalize it for other models later
        self.bpe = hub_api.bpe

        mapping = data['mapping']

        tok2idx = {}
        for tok, (new_idx, _old_idx) in mapping.items():
            tok2idx[tok] = new_idx
        self.tok2idx = tok2idx
        # tokens ordered by ascending new index
        self.idx2tok = sorted(tok2idx, key=tok2idx.get)
        assert len(self.idx2tok) == len(self.tok2idx)

        # reserved are reserved
        for tok, idx in self.reserved():
            assert self.tok2idx[tok] == idx
            assert self.idx2tok[idx] == tok

        new2old = {}
        for new_idx, old_idx in mapping.values():
            new2old[new_idx] = old_idx
        self.new_idx2old_idx = new2old
Beispiel #9
0
def read_tsv(path: str):
    """Yield each line of ``path`` split on tab characters.

    Lines are not stripped before splitting. Because this is a generator, the
    existence check runs when iteration starts, not when the function is called.
    """
    assert os.path.exists(path)
    with IO.reader(path) as f:
        for line in f:
            yield line.split('\t')
Beispiel #10
0
    def __init__(self,
                 exp: Experiment,
                 model: Optional[NMTModel] = None,
                 model_factory: Optional[Callable] = None,
                 optim: str = 'ADAM',
                 **optim_args):
        """
        Prepare a trainer: obtain the model, build the optimizer and set up logging.

        :param exp: experiment that provides model args, checkpoints, the samples
            file and the work dir
        :param model: ready-made model; when given it is used as-is and no
            checkpoint is looked up
        :param model_factory: callable invoked as
            ``model_factory(exp=exp, **exp.model_args)`` returning ``(model, args)``;
            required when ``model`` is not given
        :param optim: key into ``Optims`` selecting the inner optimizer
        :param optim_args: optimizer settings. Defaults filled in here are
            ``lr=0.1``, ``betas=[0.9, 0.98]`` and ``eps=1e-9``. ``warmup_steps``
            (default 8000), ``label_smoothing`` (default 0.1) and ``constant``
            (default 2) are taken out before the rest is passed to the inner
            optimizer.
        """
        self.start_step = 0
        self.last_step = -1
        self.exp = exp
        optim_state = None
        if model:
            self.model = model
        else:
            # no model supplied: build one from the experiment's stored args
            args = exp.model_args
            assert args
            assert model_factory
            self.model, args = model_factory(exp=exp, **args)
            # store the args returned by the factory back on the experiment
            exp.model_args = args
            last_model, self.last_step = self.exp.get_last_saved_model()
            if last_model:
                self.start_step = self.last_step + 1
                log.info(
                    f"Resuming training from step:{self.start_step}, model={last_model}"
                )
                state = torch.load(last_model)
                # a checkpoint is either a dict with a 'model_state' entry or the
                # model state itself
                model_state = state[
                    'model_state'] if 'model_state' in state else state
                if 'optim_state' in state:
                    optim_state = state['optim_state']
                self.model.load_state_dict(model_state)
            else:
                log.info(
                    "No earlier check point found. Looks like this is a fresh start"
                )

        # making optimizer
        optim_args['lr'] = optim_args.get('lr', 0.1)
        optim_args['betas'] = optim_args.get('betas', [0.9, 0.98])
        optim_args['eps'] = optim_args.get('eps', 1e-9)

        # these three are not arguments of the inner optimizer, so remove them
        # before optim_args is expanded into Optims[optim].new(...) below
        warmup_steps = optim_args.pop('warmup_steps', 8000)
        self._smoothing = optim_args.pop('label_smoothing', 0.1)
        constant = optim_args.pop('constant', 2)

        self.model = self.model.to(device)

        inner_opt = Optims[optim].new(self.model.parameters(), **optim_args)
        if optim_state:
            log.info("restoring optimizer state from checkpoint")
            try:
                inner_opt.load_state_dict(optim_state)
            except Exception:
                # best effort: training continues with a fresh optimizer state
                log.exception("Unable to restore optimizer, skipping it.")
        self.opt = NoamOpt(self.model.model_dim,
                           constant,
                           warmup_steps,
                           inner_opt,
                           step=self.start_step)

        # put the removed settings back so the complete set is stored on the experiment
        optim_args.update(
            dict(warmup_steps=warmup_steps,
                 label_smoothing=self._smoothing,
                 constant=constant))
        if self.exp.read_only:
            self.tbd = NoOpSummaryWriter()
        else:
            self.tbd = SummaryWriter(log_dir=str(exp.work_dir / 'tensorboard'))

        self.exp.optim_args = optim, optim_args
        if not self.exp.read_only:
            self.exp.persist_state()
        self.samples = None
        if exp.samples_file.exists():
            with IO.reader(exp.samples_file) as f:
                # each sample is one line split into its tab-separated columns
                self.samples = [line.strip().split('\t') for line in f]
                log.info(f"Found {len(self.samples)} sample records")
                if self.start_step == 0:
                    # fresh start: write the sample texts to the summary writer at step 0
                    for samp_num, sample in enumerate(self.samples):
                        self.tbd.add_text(f"sample/{samp_num}",
                                          " || ".join(sample), 0)

            # NOTE: self.decoder is only set when a samples file exists
            from rtg.module.decoder import Decoder
            self.decoder = Decoder.new(self.exp, self.model)

        if self.start_step == 0:
            # fresh start only (not when resuming from a checkpoint)
            self.init_embeddings()
        self.model = self.model.to(device)
Beispiel #11
0
def load_conf(inp: Union[str, Path]):
    """Parse the YAML document stored at ``inp`` and return the loaded object."""
    with IO.reader(inp) as fh:
        conf = yaml.load(fh)
    return conf
Beispiel #12
0
    def __init__(self,
                 exp: Experiment,
                 model: Optional[NMTModel] = None,
                 model_factory: Optional[Callable] = None,
                 optim: str = 'ADAM',
                 **optim_args):
        """
        Prepare a trainer: obtain the model, build the optimizer, and set up
        logging, sample decoding and the loss criterion.

        :param exp: experiment that provides model args, config, checkpoints,
            the samples file and the work dir
        :param model: ready-made model; when given it is used as-is and no
            checkpoint is looked up
        :param model_factory: callable invoked as
            ``model_factory(exp=exp, **exp.model_args)`` returning ``(model, args)``;
            required when ``model`` is not given
        :param optim: key into ``Optims`` selecting the inner optimizer
        :param optim_args: optimizer settings; keys missing here are filled from
            ``self.default_optim_args``. After merging, the keys ``lr``, ``betas``,
            ``eps``, ``weight_decay``, ``amsgrad``, ``constant``, ``warmup_steps``,
            ``inv_sqrt`` and ``criterion`` are read directly, so each must come
            from the caller or the defaults.
        """
        # NOTE(review): self.start_step and self.default_optim_args are read below but
        # not assigned in this method; presumably they are defined on the class -- confirm
        self.last_step = -1
        self.exp = exp
        optim_state = None
        if model:
            self.model = model
        else:
            # no model supplied: build one from the experiment's stored args
            args = exp.model_args
            assert args
            assert model_factory
            self.model, args = model_factory(exp=exp, **args)
            # store the args returned by the factory back on the experiment
            exp.model_args = args
            last_model, self.last_step = self.exp.get_last_saved_model()
            if last_model:
                log.info(
                    f"Resuming training from step:{self.last_step}, model={last_model}"
                )
                state = torch.load(last_model, map_location=device)
                # a checkpoint is either a dict with a 'model_state' entry or the
                # model state itself
                model_state = state[
                    'model_state'] if 'model_state' in state else state

                if 'optim_state' in state:
                    optim_state = state['optim_state']
                self.model.load_state_dict(model_state)
                if 'amp_state' in state and dtorch.fp16:
                    log.info("Restoring  AMP state")
                    dtorch._scaler.load_state_dict(state['amp_state'])
            else:
                log.info(
                    "No earlier check point found. Looks like this is a fresh start"
                )

        # optimizer : default args for missing fields
        for k, v in self.default_optim_args.items():
            optim_args[k] = optim_args.get(k, v)

        self.n_gpus = torch.cuda.device_count()
        self.device_ids = list(range(self.n_gpus))

        # only these keys are passed on to the inner optimizer
        inner_opt_args = {
            k: optim_args[k]
            for k in ['lr', 'betas', 'eps', 'weight_decay', 'amsgrad']
        }

        self.core_model = self.model.to(device)

        trainable_params = self.exp.config['optim'].get('trainable', {})
        if trainable_params:
            # use the same distributed helper (dtorch) as the rest of this method
            if dtorch.is_distributed:  # model is wrapped in DP or DistributedDP
                log.warning(
                    ">> Using more than 1 GPU with 'trainable' params is NOT tested"
                )
            trainable_params = self.core_model.get_trainable_params(
                include=trainable_params.get('include'),
                exclude=trainable_params.get('exclude'))
        else:
            trainable_params = self.model.parameters()

        inner_opt = Optims[optim].new(trainable_params, **inner_opt_args)
        # self.model may become a wrapper; self.core_model keeps the model moved to device
        self.model = dtorch.maybe_distributed(self.core_model)

        if optim_state:
            log.info("restoring optimizer state from checkpoint")
            try:
                inner_opt.load_state_dict(optim_state)
            except Exception:
                # best effort: training continues with a fresh optimizer state
                log.exception("Unable to restore optimizer, skipping it.")
        self.opt = NoamOpt(self.core_model.model_dim,
                           optim_args['constant'],
                           optim_args['warmup_steps'],
                           inner_opt,
                           step=self.start_step,
                           inv_sqrt=optim_args['inv_sqrt'])

        if self.exp.read_only:
            self.tbd = NoOpSummaryWriter()
        else:
            self.tbd = SummaryWriter(log_dir=str(exp.work_dir / 'tensorboard'))

        self.exp.optim_args = optim, optim_args
        if not self.exp.read_only:
            self.exp.persist_state()
        self.samples = None
        if exp.samples_file and exp.samples_file.exists():
            with IO.reader(exp.samples_file) as f:
                # each sample is one line split into its tab-separated columns
                self.samples = [line.strip().split('\t') for line in f]
                log.info(f"Found {len(self.samples)} sample records")
                if self.start_step == 0:
                    # fresh start: write the sample texts to the summary writer at step 0
                    for samp_num, sample in enumerate(self.samples):
                        self.tbd.add_text(f"sample/{samp_num}",
                                          " || ".join(sample), 0)

            # NOTE: self.decoder is only set when a samples file exists
            from rtg.module.decoder import Decoder
            self.decoder = Decoder.new(self.exp, self.core_model)

        if self.start_step <= 1:
            self.maybe_init_model()

        self.criterion = self.create_criterion(optim_args['criterion'])