Example #1
def main(input_file: str, output_file: str, checking_file: str,
         keep_rate: float):
    keep_rate = float(keep_rate)
    _gen = Random.get_np_generator(12345)
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(
        yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    if checking_file:
        checking_sents = list(
            yield_sents(
                ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        # collect keys
        hit_keys = set()
        for one_check_sent in checking_sents:
            tok_key = ''.join(one_check_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            hit_keys.add(tok_key)
        # filter
        filtered_sents = []
        for one_input_sent in input_sents:
            tok_key = ''.join(one_input_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            if tok_key not in hit_keys:
                filtered_sents.append(one_input_sent)
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample
    if keep_rate < 1.:  # interpret as a sampling probability
        sample_sents = [
            s for r, s in zip(rstream, filtered_sents) if r < keep_rate
        ]
    elif keep_rate > 10:  # interpret as an absolute count: shuffle, then truncate
        sample_sents = list(filtered_sents)
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:  # 1 <= keep_rate <= 10: keep everything
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    if os.path.exists(output_file):
        assert False, f"File exists: {output_file}, delete it first!"
    if output_file:
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(f"Read {input_file}, check {checking_file}, output {output_file}, stat:")
    OtherHelper.printd(stat)
Example #2
 def _proj_grads(self, flattened_grads):
     _shuffle = self.conf.shuffle_losses
     if _shuffle:
         _gen = Random.get_generator('loss')
     _rates = self.conflicting_change_rates
     # --
     all_g = []
     for i, cur_g in enumerate(flattened_grads):
         new_g = cur_g.clone()
         other_idxes = list(range(len(flattened_grads)))
         if _shuffle:
             _gen.shuffle(other_idxes)
         for j in other_idxes:
             other_g = flattened_grads[j]
             rate = _rates[i][j]
             if rate > 0.:
                 _dot = (new_g * other_g).sum()
                 _other_s2 = (other_g * other_g).sum()
                 _offset = (_dot / _other_s2) * other_g
                 new_g.sub_(rate * ((_dot < 0).float() * _offset))
                 # -- just checking!
                 if BK.get_value(_dot).item() < 0:
                     zlog(f"Here! _dot<0 as _dot={_dot}, _off={_dot / _other_s2}")
                 # --
         all_g.append(new_g)
     ret = BK.stack(all_g, 0).sum(0)  # [*]
     return ret
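
A minimal sketch (plain numpy, a hypothetical two-gradient case) of the projection above: when two gradients conflict (negative dot product), the component of one along the other is subtracted out, PCGrad-style.

import numpy as np

g1 = np.array([1.0, 1.0])
g2 = np.array([-1.0, 0.5])
_dot = np.dot(g1, g2)  # -0.5: the two gradients conflict
if _dot < 0:
    g1 = g1 - (_dot / np.dot(g2, g2)) * g2  # project out the conflicting component
print(np.dot(g1, g2))  # ~0: the projected g1 is orthogonal to g2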
Example #3
 def yield_train_yielder(self):
     all_yielders = []
     all_svs = []
     all_inner_rates = []
     for group_name, group_datasets in self.datasets["train"].items():
         all_yielders.append([z.yield_batches() for z in group_datasets])
         one_inner_rates = np.asarray(
             [len(z.items) ** z.conf.group_sample_alpha for z in group_datasets])
         all_inner_rates.append(one_inner_rates / one_inner_rates.sum())  # normalized inner sampling rates
         all_svs.append(self.train_sample_svs[group_name])
     _gen = Random.get_generator('stream')
     _n_groups = len(all_svs)
     while True:
         # choose the outer
         if len(all_svs) == 1:
             cur_gidx = 0  # only one group, no sampling needed
         else:
             pvals = np.asarray([z.value for z in all_svs])
             pvals = pvals / pvals.sum()
             cur_gidx = _gen.choice(_n_groups, p=pvals)  # choose group
         # choose the inner
         pvals2 = all_inner_rates[cur_gidx]
         if len(pvals2) == 1:
             cur_iidx = 0
         else:
             cur_iidx = _gen.choice(len(pvals2), p=pvals2)  # choose inner one
         # choose that one!
         chosen_yielder = all_yielders[cur_gidx][cur_iidx]
         yield chosen_yielder
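
A minimal sketch (plain numpy, made-up sizes and weights) of the two-level sampling above: first choose a group by its scheduled weight, then choose a dataset inside the group with probability proportional to len(items) ** alpha.

import numpy as np

rng = np.random.default_rng(0)
group_weights = np.asarray([2.0, 1.0])  # e.g. scheduled values, one per group
inner_sizes = [np.asarray([100., 400.]), np.asarray([50.])]  # dataset sizes per group
alpha = 0.5
inner_rates = [s ** alpha / (s ** alpha).sum() for s in inner_sizes]
gidx = rng.choice(len(group_weights), p=group_weights / group_weights.sum())
iidx = rng.choice(len(inner_rates[gidx]), p=inner_rates[gidx])
print(gidx, iidx)  # indexes of the chosen group and dataset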
Example #4
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    # input
    with zopen(conf.input) as fd:
        lines = list(fd)
        if conf.skip_blank:
            lines = [z for z in lines if not str.isspace(z)]  # drop blank lines
    # shuffle?
    origin_len = len(lines)
    if conf.shuffle_times > 0 or conf.shuffle:
        _t = max(1, conf.shuffle_times)  # at least once!
        _gen = Random.get_generator('')
        for _ in range(_t):
            _gen.shuffle(lines)
    # sample?
    # rate <= 1 is a fraction of the input (rounded up); anything larger is an absolute count
    final_size = int(0.999 + (conf.rate * origin_len if conf.rate <= 1. else conf.rate))
    out_lines = lines[:final_size]
    # output
    if conf.output:
        with zopen(conf.output, 'w') as fd2:
            for line in out_lines:
                fd2.write(line)
    # --
    zlog(f"Sample({conf.rate}) {conf.input}=>{conf.output}: {origin_len}=>{len(out_lines)}")
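
A minimal sketch of the size rule above: rate <= 1 is read as a fraction of the input (rounded up via the +0.999 trick), anything larger as an absolute count.

def final_size(rate: float, n: int) -> int:
    return int(0.999 + (rate * n if rate <= 1. else rate))

assert final_size(0.5, 101) == 51   # ceil(50.5)
assert final_size(200, 1000) == 200  # absolute count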
Example #5
def iter_arg_choices(m: List, repeat=True, shuffle=True, max_num=-1):
    _gen = Random.get_generator("tune")
    # --
    idx = 0
    # expand fully
    args_pool = None
    if not repeat:
        args_pool = [[]]
        for cur_items in m:
            new_pool = []
            for a in args_pool:
                for one_idx in range(len(cur_items)):
                    new_pool.append(a + [one_idx])
            args_pool = new_pool
        # --
        zlog("** Arrange non-repeat iter, sized %d." % len(args_pool))
        if shuffle:
            for _ in range(10):
                _gen.shuffle(args_pool)
        else:
            args_pool.reverse()  # later using pop
    while True:
        if idx == max_num:  # note: a negative max_num means no limit
            break
        if repeat:
            sel_idxes = [_gen.randint(len(one)) for one in m]
        else:
            if len(args_pool) > 0:
                sel_idxes = args_pool.pop()
            else:
                break
        # -----
        yield sel_idxes  # return selection idxes
        idx += 1
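
A hypothetical usage sketch: each inner list of m is one option group, and the iterator yields lists of indexes into those groups.

grid = [["lr0.001", "lr0.01"], ["bs16", "bs32", "bs64"]]
for sel_idxes in iter_arg_choices(grid, repeat=False, shuffle=False, max_num=4):
    print([group[i] for group, i in zip(grid, sel_idxes)])
# repeat=False enumerates distinct combinations (here 4 of the 6);
# repeat=True instead samples each group's index independently every step.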
Example #6
 def __init__(self, cl_helper: CLHelper, ii: int, frames: List):
     super().__init__()
     # --
     self.frames = frames.copy()  # save a copy for shuffle
     self.cl_helper = cl_helper
     self.ii = ii
     self.r_ii = self.cl_helper.cl_rank_idx[ii]
     # --
     self._gen = Random.get_generator('stream')
     self.p = 0  # current pointer
     self.p_ret = 0  # how many items have been returned?
Example #7
 def do_presample(insts: List, s: float, shuffle: bool, reverse: bool):
     assert s > 0
     if s < 1.:
         s = len(insts) * s
     s = int(s + 0.99999)  # round up to an integer count
     # --
     ret_idxes = list(range(len(insts)))
     if reverse:
         ret_idxes = list(reversed(ret_idxes))
     if shuffle:
         _gen = Random.get_generator('presample')
         _gen.shuffle(ret_idxes)
     ret_idxes = ret_idxes[:s]
     return [insts[z] for z in ret_idxes], ret_idxes
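
A minimal usage sketch (made-up items): s < 1 keeps a fraction (rounded up), s >= 1 keeps that many items; the chosen indexes are returned alongside the instances.

insts = list(range(10))
subset, idxes = do_presample(insts, 0.25, shuffle=False, reverse=True)
print(subset, idxes)  # [9, 8, 7], [9, 8, 7]: 10 * 0.25 rounds up to 3, taken in reversed order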
Example #8
 def yield_batches(self, stream_item, loop: bool, filter_f=None):
     conf = self.conf
     _gen = Random.get_generator('stream')
     _bucket_shuffle_times = conf.bucket_shuffle_times
     if filter_f is None:
         filter_f = lambda x: True  # no drop
     # --
     # prepare
     buckets = self._put_buckets(stream_item)
     orig_counts = [len(b) for b in buckets]
     pvals = np.asarray(orig_counts) / sum(orig_counts)  # for sample!
     arrangers = []
     for b_items in buckets:
         # first shuffle
         for _ in range(_bucket_shuffle_times):
             _gen.shuffle(b_items)
         # get arranger
         input_stream = IterStreamer(b_items, restartable=True)
         arranger = BatchArranger(
             input_stream,
             bsize=conf.batch_size,
             maxi_bsize=conf.batch_maxi_bsize,
             batch_size_f=self.batch_size_f,
             dump_detectors=(lambda x: not filter_f(x)),
             sorting_keyer=(lambda x: len(x)),
             shuffle_batches_times=_bucket_shuffle_times)
         arranger.restart()
         arrangers.append(arranger)
     # go!!
     _len_buckets = len(buckets)
     while True:
         choice = _gen.choice(_len_buckets, p=pvals)  # choose a bucket
         chosen_arranger = arrangers[choice]
         items, _eos = chosen_arranger.next_and_check()
         if _eos:
             if loop:  # simply restart it
                 chosen_arranger.restart()
             else:  # this bucket is exhausted: zero its pval and renormalize; note: pvals are not refreshed every batch, but that probably does not matter!
                 pvals[choice] = 0.
                 _remain = pvals.sum().item()
                 if _remain <= 0.:
                     break  # finished!!
                 pvals = pvals / _remain
         else:
             yield items
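
A minimal sketch (plain numpy) of the exhaustion handling above: when a non-looping bucket runs out, its probability is zeroed and the rest renormalized, so later draws only visit buckets that still have batches.

import numpy as np

pvals = np.asarray([0.5, 0.3, 0.2])
pvals[1] = 0.  # bucket 1 is exhausted
remain = pvals.sum()
if remain > 0.:
    pvals = pvals / remain
print(pvals)  # [0.714..., 0., 0.286...]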
Example #9
 def __init__(self,
              base_streamers: List[Streamer],
              stop_sidx=-1,
              ratios: List[SupportsFloat] = None,
              verbose=True):
     super().__init__(base_streamers)
     # --
     if ratios is None:  # by default all 1
         ratios = [1.] * self._num_streamers
     assert self._num_streamers > 0 and self._num_streamers == len(ratios)
     self._ratios = ratios
     self._stop_sidx = stop_sidx
     self._random_sampler = Random.stream(STREAMER_RANDOM_GEN.random_sample)
     # status
     self._cur_idx = self._num_streamers - 1
     self._cur_ratio = 0.
     self._cur_counts = [0] * self._num_streamers
     self.verbose = verbose
Example #10
def _my_get_params_init(conf: NIConf, shape: Union[List[int], Tuple[int]], init: Union[str, object], lookup: bool):
    # shape is a tuple of dims
    assert init in ["default", "random", "glorot", "ortho", "gaussian", "zeros"], f"Unknown init method {init}"
    poss_scale = conf.init_scale_l if lookup else conf.init_scale_nl
    if len(shape) == 1:  # set bias to 0
        return np.zeros((shape[0],))
    else:
        # get defaults
        if init == "default":
            init = conf.init_def_l if lookup else conf.init_def_nl
        _gen = Random.get_generator("param")
        # specifics
        if init == "glorot":
            if lookup:  # special for lookups
                shape_g = (shape[-1], )  # fan-out for lookup
            else:
                shape_g = shape
            w0 = _gen.random_sample(shape)  # [0,1)
            w0 = (w0-0.5)*(2*(np.sqrt(3.0*len(shape_g)/(sum(shape_g)))))  # uniform in [-a,a] with a=sqrt(3*len(shape_g)/sum(shape_g))
            return w0*poss_scale
        elif init == "random":
            w0 = _gen.random_sample(shape)  # [0,1)
            w0 = (w0-0.5)*2
            return w0*poss_scale
        elif init == "gaussian":
            w0 = _randn_clip(_gen, shape, 2.)  # clip to [-2, 2]
            return w0*poss_scale
        elif init == "ortho":
            # todo(note): always assume init square matrices
            assert len(shape)==2 and (shape[0] % shape[1] == 0 or shape[1] % shape[0] == 0), f"Bad shape {shape} for ortho_init!"
            orig_num = shape[0] // shape[1]
            if orig_num == 0:
                num = shape[1] // shape[0]
            else:
                num = orig_num
            if num == 1:
                w0 = _ortho_weight(_gen, shape[1])
            else:
                w0 = np.concatenate([_ortho_weight(_gen, shape[1]) for _ in range(num)])
            if orig_num == 0:  # reverse it!
                w0 = np.transpose(w0)
            return w0*poss_scale
        elif init == "zeros":
            return np.zeros(shape)
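
A minimal sketch (plain numpy) of the "glorot" branch above for a 2-D weight: uniform values in [-a, a] with a = sqrt(6 / (fan_in + fan_out)).

import numpy as np

shape = (4, 4)
limit = np.sqrt(3.0 * len(shape) / sum(shape))  # == sqrt(6/8) for a 2-D shape
w0 = (np.random.random_sample(shape) - 0.5) * (2 * limit)
assert np.abs(w0).max() <= limit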
Example #11
 def _get_grads(self,
                params,
                flatten: bool,
                drop_whole=0.,
                drop_partial=0.):
     grads = [
         p.grad.detach().clone() if p.grad is not None else BK.zeros(p.shape)
         for p in params
     ]
     if drop_whole > 0.:
         _gen = Random.get_generator('loss')
         _mask = (_gen.random(len(grads)) < drop_whole)
         grads = [(g * 0. if m else g) for g, m in zip(grads, _mask)]
     if drop_partial > 0.:
         grads = [g * (BK.rand(g.shape) < drop_partial).float() for g in grads]  # note: keeps entries where rand < drop_partial
     if flatten:
         return BK.concat([z.flatten() for z in grads], 0)
     else:
         return grads
Example #12
 def filter_embed(self,
                  wv: 'WordVectors',
                  init_nohit=None,
                  scale=1.0,
                  assert_all_hit=False):
     if init_nohit is None:  # auto decide by wv
         init_nohit = np.mean([np.std(z) for z in wv.vecs]).item()
         zlog(f"Auto decide init_nohit={init_nohit}")
     if init_nohit <= 0.:
         get_nohit = lambda s: np.zeros((s, ), dtype=np.float32)
     else:
         _generator = Random.get_generator("vocab")
         # get_nohit = lambda s: (_generator.random_sample((s,)).astype(np.float32)-0.5) * (2*init_nohit)
         get_nohit = lambda s: _generator.standard_normal(s) * init_nohit
     #
     ret = []
     res = defaultdict(int)
     embed_size = wv.get_emb_size()
     # for w in self.keys():  # todo(+N): once a bug!
     for w in self.full_i2w:
         hit, norm_name, norm_w = wv.norm_until_hit(w)
         if hit:
             value = np.asarray(wv.get_vec(norm_w, norm_name=False),
                                dtype=np.float32)
             res[norm_name] += 1
         else:
             value = get_nohit(embed_size)
             # value = np.zeros((embed_size,), dtype=np.float32)
             res["no-hit"] += 1
         ret.append(value)
     # --
     if assert_all_hit:
         assert res["no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {res['no-hit']}"
     zret = np.asarray(ret, dtype=np.float32) * scale
     zlog(f"Filter pre-trained embed {self}->{zret.shape}: {res}, no-hit is inited with {init_nohit}.")
     return zret
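
A minimal sketch (plain numpy, made-up embedding size) of the no-hit fallback above: words missing from the pre-trained vectors get all-zero vectors when init_nohit <= 0, and scaled standard-normal vectors otherwise.

import numpy as np

rng = np.random.RandomState(0)
init_nohit = 0.1
if init_nohit <= 0.:
    get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
else:
    get_nohit = lambda s: rng.standard_normal(s) * init_nohit
vec = get_nohit(300)  # one fallback vector of embedding size 300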
Example #13
        c_stream = CacheStreamer(i_stream, shuffle_times=cache_shuffle_times)
    return c_stream

# especially for training
def train_prep_stream(in_stream: Streamer, tconf: TConf):
    # for training, we get all the sentences!
    assert tconf.train_stream_mode == "sent", "Currently we only support sent training!"
    sent_stream = FListWrapperStreamer(
        in_stream,
        lambda d: [x for x in yield_sents([d])
                   if len(x) <= tconf.train_max_length and len(x) >= tconf.train_min_length
                   and (len(x.events) > 0 or next(_BS_sample_stream) > tconf.train_skip_noevt_rate)])  # filter out certain sents!
    if tconf.train_stream_reshuffle_times > 0:  # reshuffle for sents
        sent_stream = ShuffleStreamer(sent_stream, shuffle_bsize=tconf.train_stream_reshuffle_bsize,
                                      shuffle_times=tconf.train_stream_reshuffle_times)
    return sent_stream

# function to get BatchArranger
_BK_gen = Random.get_generator("train")
_BS_sample_stream = Random.stream(_BK_gen.random_sample)
def batch_stream(in_stream: Streamer, tconf: TConf, training: bool):
    _sent_counter = lambda d: len(list(yield_sents([d])))
    _tok_counter = lambda d: sum(len(s) for s in yield_sents([d]))
    _frame_counter = lambda d: sum(len(s.events) for s in yield_sents([d]))
    _ftok_counter = lambda d: sum(max(1, len(s.events))*len(s) for s in yield_sents([d]))
    batch_size_f_map = {"sent": _sent_counter, "tok": _tok_counter, "frame": _frame_counter, "ftok": _ftok_counter}
    if training:
        batch_size_f = batch_size_f_map[tconf.train_count_mode]
        b_stream = BatchArranger(in_stream, bsize=tconf.train_batch_size, maxi_bsize=tconf.train_maxibatch_size,
                                 batch_size_f=batch_size_f, dump_detectors=None, single_detectors=None, sorting_keyer=lambda x: len(x),
                                 shuffle_batches_times=tconf.train_batch_shuffle_times)
    else:
        batch_size_f = batch_size_f_map[tconf.test_count_mode]
        b_stream = BatchArranger(in_stream, bsize=tconf.test_batch_size, maxi_bsize=1, batch_size_f=batch_size_f,
Example #14
 def run(self):
     conf = self.conf
     last_report_uidx, last_dev_uidx = 0, 0
     # --
     if conf.valid_first:  # valid before training
         self.validate()
     # --
     _lrate_warmup_factor, _lrate_warmup_steps = self.lrate_warmup_factor, self.lrate_warmup_steps
     _skip_batch = conf.skip_batch
     _gen0 = Random.get_generator("train")
     _gen = Random.stream(_gen0.random_sample)
     # --
     _accu_checker = 0
     _accu_batch = conf.accu_batch
     # --
     # start before loop
     self.adjust_scheduled_values()
     # loop
     act_lrate = None
     while True:  # loop over and over
         _train_stream = self.get_train_stream()  # we may change train_stream!!
         # --
         if _train_stream.is_inactive():  # check to avoid restart after load_progress
             _train_stream.restart()
         insts, _eos = _train_stream.next_and_check()
         if _eos:  # end of epoch
             zlog(
                 f"End of epoch at {self.tp.current_suffix(False)}: Current act_lrate is {act_lrate}.",
                 func="plain",
                 timed=True)
             if conf.valid_epoch:
                 last_dev_uidx = self.tp.uidx
                 self.validate()
                 # todo(+N): do we need to adjust sv at a finer grained?
                 self.adjust_scheduled_values()  # adjust after validation
             if self._finished():
                 break
             self.tp.update_eidx(1)
             continue
         # skip batch?
         if _skip_batch > 0 and next(_gen) < _skip_batch:
             continue
         if self.train_discard_batch_f(insts):
             continue  # discard this batch due to some specific reasons (like noevt)
         # run fb (possibly split batch)
         self.fb_batch(insts, 1. / _accu_batch)
         self.tp.update_iidx(len(insts))
         # ==
         # only update for certain accu fb runs
         _accu_checker += 1
         if _accu_checker % _accu_batch == 0:
             self.tp.update_uidx(1)
             cur_uidx = self.tp.uidx
             # get the effective lrate and update
             act_lrate = float(self.lrate.value)  # start with the lrate.value
             if cur_uidx < _lrate_warmup_steps:  # linear increase
                 act_lrate *= (cur_uidx / _lrate_warmup_steps)
             else:  # decrease
                 act_lrate *= _lrate_warmup_factor * (
                     cur_uidx**conf.lrate_decrease_alpha)
             self._run_update(act_lrate, 1.)
             # --
             # report on training process
             if conf.flag_verbose and (
                     cur_uidx - last_report_uidx) >= conf.report_ufreq:
                 zlog(
                     f"Report at {self.tp.current_suffix(False)}: Current act_lrate is {act_lrate}.",
                     func="plain",
                     timed=True)
                 self._run_train_report()
                 last_report_uidx = cur_uidx
             # valid?
             if (cur_uidx - last_dev_uidx) >= conf.valid_ufreq:
                 last_dev_uidx = self.tp.uidx
                 self.validate()
                 # todo(+N): do we need to adjust sv at a finer grained?
                 self.adjust_scheduled_values()  # adjust after validation
                 if self._finished():
                     break
         # =====
     zlog(f"Finish training because of: {self._reach_ends()}", func="plain")
     zlog(
         f"zzzzzfinal: After training, the best point is: {self.tp.info_best()}.",
         func="report")
Example #15
#

from typing import Union, Iterable, List, Callable, SupportsFloat
from msp2.utils import zfatal, Random, Constants, zlog

# =====
# basic

STREAMER_RANDOM_GEN = Random.get_generator('stream')


# basic streamer
class Streamer:
    def __init__(self):
        self.eos = None  # by default, EOS is None
        # status
        self._count = 0
        self._max_count = 0
        self._restart_times = 0
        self._active = False
        self._stack = []

    def __repr__(self):
        return f"{self.__class__.__name__}(A={self._active},R={self._restart_times},C={self._count})"

    def __iter__(self):
        self.restart()  # for convenience
        return self

    def __next__(self):
        one = self.next()