class DataLoaderRegistry(BaseDataLoaderRegistry):

    add_argument('data_path', dtype='path', msg='Path to the dataset.')
    add_argument('src_lang', dtype=str, msg='ISO code for the source language.')
    add_argument('tgt_lang', dtype=str, msg='ISO code for the target language.')
    add_argument('input_format', dtype=str, choices=['wikt', 'ielex'], default='ielex',
                 msg='Input format.')

    def get_data_loader(self, setting: BaseSetting, cog_reg: CognateRegistry, **kwargs) -> BaseDataLoader:
        if setting.task == 'one_pair':
            # TODO(j_luo) The options can all be part of setting.
            dl_cls = VSOnePairDataLoader if g.use_rl else OnePairDataLoader
            dl = dl_cls(setting, cog_reg, **kwargs)
        else:
            raise ValueError(f'Cannot understand this task "{setting.task}".')
        return dl
class BaseTrainer(BaseTrainerDev):

    add_argument('num_steps', default=1000, dtype=int, msg='Number of steps for training.')
    add_argument('save_model', dtype=bool, default=True, msg='Flag to save model.')
    add_argument('almt_reg_hyper', dtype=float, default=0.0, msg='Hyperparameter for alignment.')
    add_argument('weight_decay', dtype=float, default=0.0, msg='Hyperparameter for weight decay.')
    add_argument('concentration_scale', dtype=float, default=1.0, msg='Hyperparameter for concentration scale.')
    add_argument('train_mode', dtype=str, default='mle', choices=['mle', 'mrt'],
                 msg='Training mode: either MRT or MLE.')
    add_argument('init_entropy_reg', dtype=float, default=0.0,
                 msg='Initial entropy regularization hyperparameter.')
    add_argument('end_entropy_reg', dtype=float, default=0.0,
                 msg='Bound for entropy regularization hyperparameter.')
    add_argument('when_entropy_reg', dtype=int, default=100,
                 msg='When to reach the bound for entropy regularization hyperparameter.')

    def add_trackables(self):
        self.tracker.add_count_trackable('step', g.num_steps)

    def save(self, eval_metrics: Metrics):
        if g.save_model:
            path = g.log_dir / 'saves' / f'model.{self.stage}.pth'
            path.parent.mkdir(exist_ok=True, parents=True)
            torch.save(self.model.state_dict(), path)
            logging.info(f'Model saved to {path}.')
        else:
            logging.info('No model is saved.')
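
# Sketch: the `init_entropy_reg` / `end_entropy_reg` / `when_entropy_reg` triple suggests
# the entropy-regularization weight is annealed from its initial value to its bound over
# a fixed number of steps. The schedule itself is defined elsewhere, so linear annealing
# is an assumption here.
def entropy_reg_at(step: int, init_reg: float, end_reg: float, when: int) -> float:
    """Linearly anneal from `init_reg` to `end_reg` over `when` steps (assumed schedule)."""
    frac = 1.0 if when <= 0 else min(step, when) / when
    return init_reg + frac * (end_reg - init_reg)
# E.g. init=0.1, end=0.0, when=100: step 0 -> 0.1, step 50 -> 0.05, step >= 100 -> 0.0.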
class OneToManyModel(BaseModel):

    add_argument('lang_emb_mode', default='mean', dtype=str,
                 choices=['random', 'mean', 'lang2vec', 'wals'],
                 msg='Mode for the language embedding module.')
    add_argument('l2v_feature_set', default=None, dtype=str,
                 choices=['phonology_average', 'phonology_wals', 'phonology_ethnologue', 'learned'],
                 msg='Which feature set to use for the lang2vec language embeddings.')

    def __init__(self, num_src_chars: int, num_tgt_chars: int,
                 num_tgt_langs: int, unseen_idx: int,
                 lang2id: Optional[Dict[str, int]] = None,
                 phono_feat_mat: Optional[LT] = None,
                 special_ids: Optional[Sequence[int]] = None):
        super().__init__(num_src_chars, num_tgt_chars,
                         phono_feat_mat=phono_feat_mat,
                         special_ids=special_ids)
        self.lang_emb = LanguageEmbedding(num_tgt_langs, g.char_emb_size,
                                          unseen_idx=unseen_idx,
                                          lang2id=lang2id,
                                          mode=g.lang_emb_mode,
                                          dropout=g.dropout)

    def _prepare_lang_emb(self, batch: Union[OnePairBatch, SourceOnlyBatch]) -> FT:
        return self.lang_emb(batch.tgt_lang_id)
class BeamSearcher(BaseSearcher):

    add_argument('beam_size', default=200, dtype=int, msg='Size of beam.')

    def search_by_probs(self, lengths: LT, label_log_probs: FT) -> Tuple[LT, FT]:
        max_length = lengths.max().item()
        bs = label_log_probs.size('batch')
        label_log_probs = label_log_probs.align_to('length', 'batch', 'label')
        beam = Beam(bs)
        for step in range(max_length):
            __label_log_probs = label_log_probs[step]
            # Zero out log probs for positions beyond each sequence's length.
            within_length = (step < lengths).align_as(__label_log_probs)
            beam.extend(__label_log_probs * within_length.float())
        beam.finish_search(lengths)
        samples = beam.samples.rename(beam='sample')
        sample_log_probs = beam.sample_log_probs.rename(beam='sample')
        return samples, sample_log_probs
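
# Sketch: multiplying per-step log probs by a 0/1 length mask makes finished sequences
# contribute log-prob 0 (probability 1) past their end, so their cumulative scores
# freeze. Plain tensors, illustrative shapes only.
def _demo_length_masking():
    import torch
    lengths = torch.tensor([2, 3])                       # batch of 2 sequences
    log_probs = torch.full((3, 2, 4), -1.0)              # (length, batch, label)
    scores = torch.zeros(2, 4)
    for step in range(3):
        within = (step < lengths).float().unsqueeze(-1)  # (batch, 1)
        scores = scores + log_probs[step] * within       # padded steps add 0.0
    return scores                                        # row 0 reflects only 2 real steps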
class G2PLayer(nn.Module):

    add_argument('g2p_window_size', default=3, dtype=int, msg='Window size for g2p layer.')

    def __init__(self, lu_size: int, ku_size: int):
        """`lu_size`: number of lost units, `ku_size`: number of known units."""
        super().__init__()
        self.unit_aligner = nn.Embedding(lu_size, ku_size)
        logging.imp('Unit aligner initialized to 0.')
        self.unit_aligner.weight.data.fill_(0.0)
        self.conv = nn.Conv1d(g.dim, g.dim, g.g2p_window_size,
                              padding=g.g2p_window_size // 2)
        self.dropout = nn.Dropout(g.dropout)

    def forward(self, ku_id_seqs: LT, lu_repr: FT) -> Tuple[FT, FT]:
        """Returns lu x ku representation and bs x l x ku representation."""
        ku_char_weight = self.unit_aligner.weight
        ku_char_repr = ku_char_weight @ lu_repr
        ku_char_repr = ku_char_repr.refine_names('ku_char_emb', 'char_emb')

        with NoName(ku_char_repr, ku_id_seqs):
            _ku_repr = ku_char_repr[ku_id_seqs].rename('batch', 'length', 'char_emb')
        _ku_repr = _ku_repr.align_to('batch', 'char_emb', ...)
        with NoName(_ku_repr):
            ku_ctx_repr = self.conv(_ku_repr).rename('batch', 'char_emb', 'length')
        ku_ctx_repr = ku_ctx_repr.align_to(..., 'char_emb')
        ku_ctx_repr = self.dropout(ku_ctx_repr)

        return ku_char_repr, ku_ctx_repr
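
# Sketch: `padding = window_size // 2` keeps the output length equal to the input
# length for odd kernels, so each position's context representation stays aligned
# with its input position.
def _demo_same_length_conv():
    import torch
    import torch.nn as nn
    conv = nn.Conv1d(8, 8, kernel_size=3, padding=3 // 2)
    x = torch.randn(2, 8, 11)         # (batch, channels, length)
    assert conv(x).shape == x.shape   # same length out as in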
from dev_misc.devlib.dp import EditDist
from dev_misc.devlib.tensor_x import TensorX as Tx
from dev_misc.trainlib import Metric, Metrics
from dev_misc.trainlib.tb_writer import MetricWriter
from dev_misc.utils import pad_for_log, pbar
from sound_law.data.alphabet import Alphabet
from sound_law.data.data_loader import OnePairBatch, OnePairDataLoader
from sound_law.evaluate.edit_dist import edit_dist_all
from sound_law.rl.mcts import Mcts
from sound_law.s2s.decoder import get_beam_probs
from sound_law.s2s.one_pair import OnePairModel
from sound_law.train.trainer import get_ce_loss

add_argument('eval_mode', dtype=str, default='edit_dist', choices=['prob', 'edit_dist'],
             msg='Evaluation mode using probabilities or edit distance.')
add_argument('comp_mode', dtype=str, default='str', choices=['ids', 'units', 'str', 'ids_gpu'],
             msg='Comparison mode.')
add_argument('use_phono_edit_dist', dtype=bool, default=False,
             msg='Flag to use phonologically-aware edit distance.')
add_argument('phono_edit_dist_scale', dtype=float, default=1.0,
             msg='Scaling factor for phonological edit distance.')
from pathlib import Path
from typing import List

import sound_law.rl.rule as rule
from dev_misc import add_argument, g

if __name__ == "__main__":
    add_argument("calc_metric", dtype=bool, default=False, msg="Whether to calculate the metrics.")
    add_argument("out_path", dtype=str, msg="Path to the output file.")

    manager, gold, states, refs = rule.simulate()
    initial_state = states[0]
    if g.in_path:
        assert len(gold) == len(states) - 1

    if g.out_path:
        with Path(g.out_path).open('w', encoding='utf8') as fout:
            for state in states:
                fout.write(f'{state.dist}\n')

    if g.calc_metric:
        # Compute the similarity between the candidate ruleset and the gold standard ruleset.
        # Let this be the model's ruleset, which we are comparing to gold.
        candidate: List[SoundChangeAction] = None
        # First, what % of the gold ruleset is present in candidate?
        n_shared_actions = 0
        # Similar actions get half credit. We count separately so these are stored as int.
        n_similar_actions = 0
        # TODO(djwyen) weight "partial credit" based on how similar the effects of the rules are,
        # which can be calculated off distance.
        for action in gold:
            similar_actions = manager.action_space.get_similar_actions(action)
            for candidate_act in candidate:
class Mcts(PyMcts):
    """Monte Carlo Tree Search class.

    Everything should be done on cpu except for evaluation.
    Use numpy arrays by default since we can potentially speed up
    some process through cython and parallel processing.
    """

    add_argument('puct_c', default=5.0, dtype=float, msg='Exploration constant.')
    add_argument('virtual_loss', default=1.0, dtype=float, msg='Virtual loss per game.')
    add_argument('game_count', default=3, dtype=int, msg='How many virtual games lost.')
    add_argument('heur_c', default=1.0, dtype=float, msg='Heuristic constant.')
    add_argument('mixing', default=0.5, dtype=float, msg='Mixing lambda hyperparameter.')
    add_argument('num_workers', default=4, dtype=int, msg='Number of workers for parallelizing MCTS.')
    add_argument('dirichlet_alpha', default=0.03, dtype=float, msg='Alpha value for the Dirichlet noise.')
    add_argument('noise_ratio', default=0.25, dtype=float, msg='Mixing ratio for the Dirichlet noise.')
    add_argument('play_strategy', default='max', dtype=str, choices=['max', 'sample_ac', 'sample_mv'],
                 msg='Play strategy.')
    add_argument('exponent', default=1.0, dtype=float, msg='The exponent for sample_ac play strategy.')

    def __init__(self, *args, agent: BasePG = None, **kwargs):
        self.agent = agent
        if g.play_strategy == 'max':
            self.play_strategy = PyPS_MAX
        else:
            self.play_strategy = PyPS_SAMPLE_AC

    def reset(self):
        # Clear priors first and then stats -- stats are needed to speed up clearing.
        self.env.clear_priors(self.env.start, True)
        self.env.clear_stats(self.env.start, True)
        logging.debug(f'#trie nodes {self.env.evict(500000)}')

    def evaluate(self, states, steps: Optional[Union[int, LT]] = None) -> List[float]:
        """Expand and evaluate the leaf nodes."""
        values = [None] * len(states)
        outstanding_idx = list()
        outstanding_states = list()
        # Deal with end states first.
        for i, state in enumerate(states):
            if state.stopped or state.done:
                # NOTE(j_luo) This value is used for backup. If already reaching the end state,
                # the final reward is either accounted for by the step reward, or by the value
                # network. Therefore, we need to set it to 0.0 here.
                values[i] = 0.0
            else:
                outstanding_idx.append(i)
                outstanding_states.append(state)

        # Collect states that need evaluation.
        if outstanding_states:
            almts1 = almts2 = None
            if g.use_alignment:
                id_seqs, almts1, almts2 = parallel_stack_ids(outstanding_states, g.num_workers,
                                                             True, self.env.max_end_length)
                almts1 = get_tensor(almts1).rename('batch', 'word', 'pos')
                almts2 = get_tensor(almts2).rename('batch', 'word', 'pos')
            else:
                id_seqs = parallel_stack_ids(outstanding_states, g.num_workers,
                                             False, self.env.max_end_length)
            id_seqs = get_tensor(id_seqs).rename('batch', 'word', 'pos')
            if steps is not None and not isinstance(steps, int):
                steps = steps[outstanding_idx]

            # TODO(j_luo) Scoped might be wrong here.
            # with ScopedCache('state_repr'):
            # NOTE(j_luo) Don't forget to call exp().
            priors = self.agent.get_policy(id_seqs, almts=(almts1, almts2)).exp()
            with NoName(priors):
                meta_priors = priors[:, [0, 2, 3, 4, 5, 6]].cpu().numpy()
                special_priors = priors[:, 1].cpu().numpy()
            if g.use_value_guidance:
                agent_values = self.agent.get_values(id_seqs, steps=steps).cpu().numpy()
            else:
                agent_values = np.zeros([len(id_seqs)], dtype='float32')

            for i, state, mp, sp, v in zip(outstanding_idx, outstanding_states,
                                           meta_priors, special_priors, agent_values):
                # NOTE(j_luo) Values should be returned even if states are duplicates
                # or have been visited.
                values[i] = v
                # NOTE(j_luo) Skip duplicate states (due to exploration collapse)
                # or visited states (due to rollout truncation).
                if not state.is_leaf():
                    continue
                self.env.evaluate(state, mp, sp)
        return values

    def add_noise(self, state: VocabState):
        """Add Dirichlet noise to `state`, usually the root."""
        noise = np.random.dirichlet(g.dirichlet_alpha * np.ones(7 * len(self.env.abc))).astype('float32')
        noise = noise.reshape(7, -1)
        meta_noise = noise[:6]
        special_noise = noise[6, :6]
        self.env.add_noise(state, meta_noise, special_noise, g.noise_ratio)

    def collect_episodes(self,
                         init_state: VocabState,
                         tracker: Optional[Tracker] = None,
                         num_episodes: int = 0,
                         is_eval: bool = False,
                         no_simulation: bool = False) -> List[Trajectory]:
        trajectories = list()
        self.agent.eval()
        if is_eval:
            self.eval()
        else:
            self.train()
        num_episodes = num_episodes or g.num_episodes
        with self.agent.policy_grad(False), self.agent.value_grad(False):
            for ei in range(num_episodes):
                root = init_state
                self.reset()
                steps = 0 if g.use_finite_horizon else None
                self.evaluate([root], steps=steps)

                # Episodes have max rollout length.
                played_path = None
                for ri in range(g.max_rollout_length):
                    if not is_eval:
                        self.add_noise(root)
                    if is_eval and no_simulation:
                        new_state = self.select_one_pi_step(root)
                        steps = steps + 1 if g.use_finite_horizon else None
                        values = self.evaluate([new_state], steps=steps)
                    else:
                        # Run many simulations before taking one action. Simulations take
                        # place in batches. Each batch would be evaluated and expanded
                        # after batched selection.
                        num_batches = g.num_mcts_sims // g.expansion_batch_size
                        for _ in range(num_batches):
                            paths, steps = self.select(root, g.expansion_batch_size, ri,
                                                       g.max_rollout_length, played_path)
                            steps = get_tensor(steps) if g.use_finite_horizon else None
                            new_states = [path.get_last_node() for path in paths]
                            values = self.evaluate(new_states, steps=steps)
                            self.backup(paths, values)
                            if tracker is not None:
                                tracker.update('mcts', incr=g.expansion_batch_size)
                        if ri == 0 and ei % g.episode_check_interval == 0:
                            k = min(20, root.num_actions)
                            logging.debug(pad_for_log(str(get_tensor(root.action_counts).topk(k))))
                            logging.debug(pad_for_log(str(get_tensor(root.q).topk(k))))
                            logging.debug(pad_for_log(str(get_tensor(root.max_values).topk(k))))

                    ps = self.play_strategy
                    if is_eval:
                        ps = PyPS_SAMPLE_AC if no_simulation else PyPS_MAX
                    new_path = self.play(root, ri, ps, g.exponent)
                    if played_path is None:
                        played_path = new_path
                    else:
                        played_path.merge(new_path)
                    root = played_path.get_last_node()
                    if tracker is not None:
                        tracker.update('rollout')
                    if root.stopped or root.done:
                        break

                trajectory = Trajectory(played_path, self.env.max_end_length)
                if ei % g.episode_check_interval == 0:
                    logging.debug(pad_for_log(str(trajectory)))
                trajectories.append(trajectory)
                if tracker is not None:
                    tracker.update('episode')
        return trajectories
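
# Sketch: `add_noise` follows the AlphaZero recipe -- at the root, each prior p is
# replaced by (1 - ratio) * p + ratio * eta with eta ~ Dirichlet(alpha), forcing some
# exploration of low-prior actions. The actual mixing happens inside `env.add_noise`
# on the C++ side; this is the assumed form.
def _demo_dirichlet_noise(priors, alpha: float = 0.03, ratio: float = 0.25):
    import numpy as np
    eta = np.random.dirichlet(alpha * np.ones_like(priors))
    noisy = (1.0 - ratio) * np.asarray(priors) + ratio * eta
    assert np.isclose(noisy.sum(), 1.0)  # still a distribution
    return noisy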
from typing import NewType

import pandas as pd

from dev_misc import Arg, add_argument, add_check, g
from dev_misc.utils import handle_sequence_inputs
from pypheature.nphthong import Nphthong
from pypheature.process import FeatureProcessor

from .alphabet import Alphabet
from .dataset import OnePairDataset
from .setting import Setting

Lang = NewType('Lang', str)
DF = pd.DataFrame

add_argument('use_stress', dtype=bool, default=True, msg='Flag to use stress.')
add_argument('use_duration', dtype=bool, default=True, msg='Flag to use duration (long or short).')
add_argument('use_diacritics', dtype=bool, default=True, msg='Flag to use diacritics.')
add_argument('use_duplicate_phono', dtype=bool, default=True,
             msg='Whether to keep duplicate symbols based on their phonological features.')
add_argument('noise_level', dtype=float, default=0.0, msg='Noise level on the target side.')
add_argument('stress_included', dtype=bool, default=False,
             msg='Flag to indicate that stress has already been included in the source language.')
add_check((Arg('use_duplicate_phono') == False)
          | (Arg('separate_output') == True)
          | (Arg('use_phono_features') == False))

_fp = FeatureProcessor()


@handle_sequence_inputs
class DecipherEvaluator(BaseEvaluator):

    add_argument('eval_max_num_samples', default=0, dtype=int, msg='Max number of samples to evaluate on.')

    def __init__(self, model: DecipherModel, dl_reg: DataLoaderRegistry, tasks: Sequence[DecipherTask]):
        self.model = model
        self.dl_reg = dl_reg
        self.tasks = tasks
        self.analyzer = DecipherAnalyzer()

    def evaluate(self, stage: str) -> Metrics:
        metrics = Metrics()
        with torch.no_grad():
            self.model.eval()
            for task in self.tasks:
                dl = self.dl_reg[task]
                task_metrics = self._evaluate_one_data_loader(dl, stage)
                metrics += task_metrics.with_prefix_(task)
        return metrics

    def _evaluate_one_data_loader(self, dl: ContinuousTextDataLoader, stage: str) -> Metrics:
        task = dl.task
        accum_metrics = Metrics()
        # Get all metrics from batches.
        dfs = list()
        total_num_samples = 0
        for batch in dl:
            if g.eval_max_num_samples and total_num_samples + batch.batch_size > g.eval_max_num_samples:
                logging.imp(f'Stopping at {total_num_samples} < {g.eval_max_num_samples} '
                            f'evaluated examples from {task}.')
                break
            model_ret = self.model(batch)
            batch_metrics, batch_df = self.predict(model_ret, batch)
            accum_metrics += batch_metrics
            # accum_metrics += self.analyzer.analyze(model_ret, batch)
            total_num_samples += batch.batch_size
            dfs.append(batch_df)
        df = pd.concat(dfs, axis=0)

        # Write the predictions to file.
        out_path = g.log_dir / 'predictions' / f'{task}.{stage}.tsv'
        out_path.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(out_path, index=None, sep='\t')

        # Compute P/R/F scores.
        accum_metrics += get_prf_scores(accum_metrics)

        return accum_metrics

    def _get_predictions(self, model_ret: DecipherModelReturn, batch: ContinuousIpaBatch) -> List[Segmentation]:
        label_log_probs = model_ret.probs.label_log_probs.align_to('batch', 'length', 'label')
        _, tag_seqs = label_log_probs.max(dim='label')
        tag_seqs = tag_seqs.align_to('batch', 'sample', 'length').int()
        lengths = batch.lengths.align_to('batch', 'sample').int()
        segment_list = None
        if self.model.vocab is not None:
            segment_list = [segment.segment_list for segment in batch.segments]
        packed_words = self.model.pack(tag_seqs, lengths, batch.feat_matrix, batch.segments,
                                       segment_list=segment_list)
        segments_by_batch = packed_words.sampled_segments_by_batch
        # Only take the first (and only) sample.
        predictions = [segments[0] for segments in segments_by_batch]
        return predictions

    def predict(self, model_ret: DecipherModelReturn, batch: ContinuousIpaBatch) -> Tuple[Metrics, pd.DataFrame]:
        metrics = Metrics()
        predictions = self._get_predictions(model_ret, batch)
        ground_truths = [segment.to_segmentation() for segment in batch.segments]
        matching_stats = get_matching_stats(predictions, ground_truths)
        metrics += matching_stats
        df = _get_df(batch.segments, ground_truths, predictions)
        return metrics, df
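
# Sketch: `get_prf_scores` presumably reduces the matching counts to precision /
# recall / F1. The standard computation, with hypothetical count names:
def _prf(num_pred: int, num_gold: int, num_matched: int):
    precision = num_matched / num_pred if num_pred else 0.0
    recall = num_matched / num_gold if num_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1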
    def __hash__(self):
        return hash(tuple(self.segment_list))


class BaseSegmentWithGoldTagSeq(BaseSegment):

    has_gold_tag_seq: ClassVar[bool] = True

    @property
    @abstractmethod
    def gold_tag_seq(self) -> LT: ...


add_argument('min_word_length', default=4, dtype=int, msg='Min length of words.')


class Segment(BaseSegmentWithGoldTagSeq):

    def __init__(self, raw_token: str):
        self._raw_token = raw_token
        self.is_noise = raw_token.startswith('#')
        self.token = raw_token[1:] if self.is_noise else raw_token
        self.ipa = get_string(self.token)
        self._merged = False
        if len(self.ipa) == 0:
            raise ValueError('Invalid IPA string.')
        self._apply_all()
        self._merge()
        self._indexify()
class LstmDecoder(nn.Module, BaseBeamSearcher):
    """A decoder that unrolls the LSTM decoding procedure by steps."""

    add_argument('input_feeding', default=False, dtype=bool, msg='Flag to use input feeding.')

    def __init__(self,
                 char_emb: CharEmbedding,
                 cell: MultiLayerLSTMCell,
                 attn: GlobalAttention,
                 hidden: nn.Linear,
                 nc_residual: NormControlledResidual,
                 dropout: float = 0.0):
        super().__init__()
        self.char_emb = char_emb
        self.cell = cell
        self.attn = attn
        self.hidden = hidden
        self.nc_residual = nc_residual
        self.drop = nn.Dropout(dropout)

    @classmethod
    def from_params(cls, dec_params: DecParams,
                    embedding: Optional[CharEmbedding] = None) -> LstmDecoder:
        emb_params = dec_params.emb_params
        lstm_params = dec_params.lstm_params
        if emb_params is None and embedding is None:
            raise ValueError('Must specify either `emb_params` or `embedding`.')
        char_emb = get_embedding(emb_params) if embedding is None else embedding
        cell = MultiLayerLSTMCell.from_params(lstm_params)
        attn = GlobalAttention(dec_params.src_hidden_size, dec_params.tgt_hidden_size)
        hidden = nn.Linear(dec_params.src_hidden_size + dec_params.tgt_hidden_size,
                           dec_params.tgt_hidden_size)
        nc_residual = NormControlledResidual(norms_or_ratios=dec_params.norms_or_ratios,
                                             control_mode=dec_params.control_mode)
        return LstmDecoder(char_emb, cell, attn, hidden, nc_residual, lstm_params.dropout)

    def forward(self,
                sot_id: int,
                src_emb: FT,
                src_outputs: FT,
                mask_src: BT,
                max_length: Optional[int] = None,
                target: Optional[LT] = None,
                lang_emb: Optional[FT] = None) -> Tuple[FT, FT]:
        # Prepare inputs.
        max_length = self._get_max_length(max_length, target)
        batch_size = mask_src.size('batch')
        input_ = self._prepare_first_input(sot_id, batch_size, mask_src.device)
        prev_att = get_zeros(batch_size, g.hidden_size) if g.input_feeding else None
        state = LstmStatesByLayers.zero_state(self.cell.num_layers,
                                              batch_size,
                                              self.attn.input_tgt_size,
                                              bidirectional=False)

        # Main loop.
        log_probs = list()
        almt_distrs = list()
        with ScopedCache('Wh_s'):
            for l in range(max_length):
                state, log_prob, almt_distr, prev_att = self._forward_step(
                    input_, src_emb, state, src_outputs, mask_src,
                    lang_emb=lang_emb, prev_att=prev_att)
                if target is None:
                    input_ = log_prob.max(dim=-1)[1].rename('batch')
                else:
                    input_ = target[l]
                log_probs.append(log_prob)
                almt_distrs.append(almt_distr)

        # Prepare outputs.
        with NoName(*log_probs), NoName(*almt_distrs):
            log_probs = torch.stack(log_probs).rename('pos', 'batch', 'unit')
            almt_distrs = torch.stack(almt_distrs).rename('tgt_pos', 'batch', 'src_pos')

        return log_probs, almt_distrs

    def _get_max_length(self, max_length: Optional[int], target: Optional[LT]) -> int:
        if self.training:
            assert target is not None
            assert target.names[1] == 'batch'
            assert len(target.shape) == 2
        if max_length is None:
            max_length = target.size("pos")
        return max_length

    def _prepare_first_input(self, sot_id: int, batch_size: int, device: torch.device) -> FT:
        input_ = torch.full([batch_size], sot_id, dtype=torch.long).rename('batch').to(device)
        return input_

    def _forward_step(self,
                      input_: LT,
                      src_emb: FT,
                      state: LstmStatesByLayers,
                      src_states: FT,
                      mask_src: BT,
                      lang_emb: Optional[FT] = None,
                      prev_att: Optional[FT] = None) -> Tuple[FT, FT, FT, FT]:
        emb = self.char_emb(input_)
        if lang_emb is not None:
            emb = emb + lang_emb
        inp = torch.cat([emb, prev_att], dim=-1) if g.input_feeding else emb
        hid_rnn, next_state = self.cell(inp, state)  # hid_rnn has gone through dropout already.
        almt, ctx = self.attn.forward(hid_rnn, src_states, mask_src)  # So has src_states.
        with NoName(hid_rnn, ctx):
            cat = torch.cat([hid_rnn, ctx], dim=-1)
        hid_cat = self.hidden(cat)
        hid_cat = self.drop(hid_cat)

        with NoName(src_emb, hid_cat, almt):
            ctx_emb = (src_emb * almt.t().unsqueeze(dim=-1)).sum(dim=0)
            hid_res = self.nc_residual(ctx_emb, hid_cat).rename('batch', 'hidden')

        logit = self.char_emb.project(hid_res)
        log_prob = logit.log_softmax(dim=-1).refine_names('batch', 'unit')

        return next_state, log_prob, almt, hid_res

    def is_finished(self, beam: Beam) -> BT:
        return beam.finished

    def get_next_candidates(self, beam: Beam) -> Candidates:
        nh = NameHelper()

        def collapse_beam(orig, is_lstm_state: bool = False):

            def wrapped(tensor):
                return nh.flatten(tensor, ['batch', 'beam'], 'BB').rename(BB='batch')

            if is_lstm_state:
                return orig.apply(wrapped)
            return wrapped(orig)

        prev_att = collapse_beam(beam.prev_att) if g.input_feeding else None
        state, log_probs, almt, att = self._forward_step(
            collapse_beam(beam.tokens),
            beam.constants.src_emb,
            collapse_beam(beam.lstm_state, is_lstm_state=True),
            beam.constants.src_outputs,
            beam.constants.src_paddings,
            lang_emb=beam.constants.lang_emb,
            prev_att=prev_att)

        def unflatten(orig, is_lstm_state: bool = False):

            def wrapped(tensor):
                return nh.unflatten(tensor.rename(batch='BB'), 'BB', ['batch', 'beam'])

            if is_lstm_state:
                return orig.apply(wrapped)
            return wrapped(orig)

        log_probs = unflatten(log_probs)
        state = unflatten(state, is_lstm_state=True)
        almt = unflatten(almt)
        att = unflatten(att)
        return Candidates(log_probs, state, almt, att)

    def get_next_beam(self, beam: Beam, cand: Candidates) -> Beam:
        nh = NameHelper()
        # Get the new scores. For finished hypotheses, we should keep adding EOT.
        placeholder = torch.full_like(cand.log_probs, -9999.9)
        placeholder[..., EOT_ID] = 0.0
        new_scores = torch.where(beam.finished.align_as(placeholder), placeholder, cand.log_probs)
        accum = new_scores + beam.accum_scores.align_as(cand.log_probs)
        lp = nh.flatten(accum, ['beam', 'unit'], 'BU')
        top_s, top_i = torch.topk(lp, beam.beam_size, dim='BU')
        num_units = accum.size('unit')
        beam_i = top_i // num_units
        tokens = top_i % num_units
        batch_i = get_named_range(beam.batch_size, 'batch')
        batch_i = batch_i.align_as(top_i)

        def retrieve(tensor, last_name: str = 'hidden') -> torch.Tensor:
            with NoName(tensor, batch_i, beam_i):
                ret = tensor[batch_i, beam_i]
            new_names = ('batch', 'beam')
            if last_name:
                new_names += (last_name, )
            return ret.refine_names(*new_names)

        next_scores = top_s.rename(BU='beam')
        next_tokens = tokens.rename(BU='beam')
        next_beam_ids = beam_i.rename(BU='beam')
        next_state = cand.state.apply(retrieve)
        next_almt = retrieve(cand.almt, last_name='tgt_pos')
        next_att = retrieve(cand.att, last_name='hidden') if g.input_feeding else None
        last_finished = retrieve(beam.finished, last_name=None)
        this_ended = next_tokens == EOT_ID
        reached_max = (beam.step + 1 == beam.constants.max_lengths)
        next_finished = last_finished | this_ended | reached_max
        next_beam = beam.follow(next_finished,
                                next_scores,
                                next_tokens,
                                next_state,
                                next_beam_ids,
                                next_almt,
                                prev_att=next_att)
        return next_beam

    def search(self,
               sot_id: int,
               src_emb: FT,
               src_outputs: FT,
               src_paddings: BT,
               src_lengths: LT,
               beam_size: int,
               lang_emb: Optional[FT] = None) -> Hypotheses:
        if beam_size <= 0:
            raise ValueError('`beam_size` must be positive.')

        batch_size = src_emb.size('batch')
        tokens = torch.full([batch_size, beam_size], sot_id,
                            dtype=torch.long).to(src_emb.device).rename('batch', 'beam')
        accum_scores = torch.full_like(tokens, -9999.9).float()
        accum_scores[:, 0] = 0.0
        init_att = None
        if g.input_feeding:
            init_att = get_zeros(batch_size, beam_size, g.hidden_size).rename('batch', 'beam', 'hidden')
        lstm_state = LstmStatesByLayers.zero_state(self.cell.num_layers,
                                                   batch_size,
                                                   beam_size,
                                                   self.attn.input_tgt_size,
                                                   bidirectional=False,
                                                   names=['batch', 'beam', 'hidden'])

        def expand_beam(orig, collapse: bool = True):
            if collapse:
                return torch.repeat_interleave(orig, beam_size, dim='batch')
            else:
                return duplicate(orig, 'batch', beam_size, 'beam')

        src_emb = expand_beam(src_emb)
        src_outputs = expand_beam(src_outputs)
        src_paddings = expand_beam(src_paddings)
        max_lengths = (src_lengths.float() * 1.5).long()
        max_lengths = expand_beam(max_lengths, collapse=False)
        constants = BeamConstant(src_emb, src_outputs, src_paddings,
                                 max_lengths, lang_emb=lang_emb)
        init_beam = Beam(0, accum_scores, tokens, lstm_state, constants,
                         prev_att=init_att)
        hyps = super().search(init_beam)
        return hyps

    def get_hypotheses(self, final_beam: Beam) -> Hypotheses:
        btb = final_beam.trace_back('tokens', 'almt')
        tokens = btb['tokens']
        almt = btb['almt']
        return Hypotheses(tokens, almt, final_beam.accum_scores)
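
# Sketch: `get_next_beam` flattens the beam and unit axes before a single top-k,
# then decomposes the flat winners with // and %. Plain tensors, illustrative shapes.
def _demo_flat_topk():
    import torch
    batch, beam_size, num_units = 2, 3, 5
    accum = torch.randn(batch, beam_size, num_units)  # (batch, beam, unit) scores
    flat = accum.view(batch, beam_size * num_units)   # flatten beam x unit
    top_s, top_i = flat.topk(beam_size, dim=-1)       # one top-k over both axes
    beam_i = top_i // num_units                       # which beam each winner extends
    tokens = top_i % num_units                        # which unit extends it
    return top_s, beam_i, tokens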
class BaseModel(nn.Module):

    add_argument('char_emb_size', default=256, dtype=int, msg='Embedding size for characters (as input).')
    add_argument('hidden_size', default=256, dtype=int, msg='Hidden size for LSTM states.')
    add_argument('num_layers', default=1, dtype=int, msg='Number of LSTM layers.')
    add_argument('dropout', default=0.2, dtype=float, msg='Dropout rate.')
    add_argument('norms_or_ratios', default=(1.0, 0.2), nargs=2, dtype=float,
                 msg='Norms or ratios of norms for the norm-controlled residual module.')
    add_argument('control_mode', default='relative', dtype=str, choices=['relative', 'absolute', 'none'],
                 msg='Control mode for the norm-controlled residual module.')
    add_argument('model_encoder_type', dtype=str, default='lstm', choices=['lstm', 'cnn'],
                 msg='Which encoder to use.')
    add_argument('kernel_sizes', dtype=int, nargs='+', default=(3, 5, 7),
                 msg='What kernel sizes to use for the CNN encoder (can include repeats).')
    add_argument('beam_size', dtype=int, default=1, msg='Beam size.')
    add_argument('separate_output', dtype=bool, default=False,
                 msg='Flag to use a separate set of params for output embeddings.')

    def __init__(self, num_src_chars: int, num_tgt_chars: int,
                 phono_feat_mat: Optional[LT] = None,
                 special_ids: Optional[Sequence[int]] = None):
        super().__init__()

        def get_lstm_params(input_size: int, bidirectional: bool) -> LstmParams:
            return LstmParams(input_size, g.hidden_size, g.num_layers, g.dropout,
                              bidirectional=bidirectional)

        def get_emb_params_inner(num_chars: int):
            return get_emb_params(num_chars, phono_feat_mat, special_ids)

        enc_emb_params = get_emb_params_inner(num_src_chars)
        if g.model_encoder_type == 'lstm':
            enc_lstm_params = get_lstm_params(g.char_emb_size, True)
            self.encoder = LstmEncoder.from_params(enc_emb_params, enc_lstm_params)
        else:
            cnn_params = CnnParams(g.hidden_size, g.kernel_sizes, g.dropout)
            self.encoder = CnnEncoder.from_params(enc_emb_params, cnn_params)

        if g.share_src_tgt_abc:
            dec_emb_params = None
            dec_embedding = self.encoder.embedding
        else:
            dec_emb_params = get_emb_params_inner(num_tgt_chars)
            dec_embedding = None
        # NOTE(j_luo) Input size is the sum of `g.char_emb_size` and `g.hidden_size`
        # if input feeding is used.
        dec_input_size = g.char_emb_size + (g.hidden_size if g.input_feeding else 0)
        dec_lstm_params = get_lstm_params(dec_input_size, False)
        dec_params = DecParams(dec_lstm_params,
                               g.hidden_size * 2,  # Bidirectional outputs.
                               g.hidden_size,
                               g.norms_or_ratios,
                               g.control_mode,
                               emb_params=dec_emb_params)
        self.decoder = LstmDecoder.from_params(dec_params, embedding=dec_embedding)

    def forward(self, batch: OnePairBatch, use_target: bool = True,
                max_length: int = None) -> Tuple[FT, FT]:
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids, batch.src_seqs.lengths)
        target = batch.tgt_seqs.ids if use_target else None
        lang_emb = self._prepare_lang_emb(batch)
        log_probs, almt_distrs = self.decoder(SOT_ID, src_emb, output,
                                              batch.src_seqs.paddings,
                                              max_length=max_length,
                                              target=target,
                                              lang_emb=lang_emb)
        return log_probs, almt_distrs

    def get_scores(self, batch: OnePairBatch, tgt_vocab_seqs: PaddedUnitSeqs,
                   chunk_size: int = 100) -> FT:
        """Given a batch and a list of target tokens (provided as id sequences),
        return scores produced by the model."""
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids, batch.src_seqs.lengths)
        src_emb = src_emb.refine_names('pos', 'batch', 'src_emb')
        output = output.refine_names('pos', 'batch', 'output')
        batch_size = src_emb.size('batch')
        lang_emb = self._prepare_lang_emb(batch)

        def create_chunk(size, base, old_chunk, interleave: bool = True):
            if not interleave:
                return base.repeat(1, batch_size)
            if old_chunk is not None and old_chunk.size('batch') == batch_size * size:
                return old_chunk
            new_chunk = torch.repeat_interleave(base, size, dim='batch')
            return new_chunk

        chunk_src_emb = None
        chunk_output = None
        chunk_src_paddings = None
        scores = list()
        for split in pbar(tgt_vocab_seqs.split(chunk_size), desc='Get scores: chunk'):
            split: PaddedUnitSeqs
            bs_split = len(split)
            chunk_src_emb = create_chunk(bs_split, src_emb, chunk_src_emb)
            chunk_output = create_chunk(bs_split, output, chunk_output)
            chunk_src_paddings = create_chunk(bs_split, batch.src_seqs.paddings, chunk_src_paddings)
            chunk_target = create_chunk(None, split.ids, None, interleave=False)
            chunk_tgt_paddings = create_chunk(None, split.paddings, None, interleave=False)
            chunk_log_probs, _ = self.decoder(SOT_ID,
                                              chunk_src_emb,
                                              chunk_output,
                                              chunk_src_paddings,
                                              target=chunk_target,
                                              lang_emb=lang_emb)
            chunk_scores = chunk_log_probs.gather('unit', chunk_target)
            chunk_scores = (chunk_scores * chunk_tgt_paddings).sum('pos')
            with NoName(chunk_scores):
                scores.append(chunk_scores.view(batch_size, bs_split).refine_names('batch', 'tgt_vocab'))
        scores = torch.cat(scores, dim='tgt_vocab')
        return scores

    def predict(self, batch: Union[SourceOnlyBatch, OnePairBatch]) -> Hypotheses:
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids, batch.src_seqs.lengths)
        src_emb = src_emb.refine_names('pos', 'batch', 'src_emb')
        output = output.refine_names('pos', 'batch', 'output')
        lang_emb = self._prepare_lang_emb(batch)
        hyps = self.decoder.search(SOT_ID, src_emb, output,
                                   batch.src_seqs.paddings,
                                   batch.src_seqs.lengths,
                                   g.beam_size,
                                   lang_emb=lang_emb)
        return hyps

    @abstractmethod
    def _prepare_lang_emb(self, batch: OnePairBatch) -> FT: ...
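
# Sketch: `get_scores` pairs every source with every candidate in a chunk by
# interleave-repeating sources and tiling candidates. Plain tensors, fake reprs.
def _demo_chunked_pairing():
    import torch
    batch_size, chunk, d = 2, 3, 4
    src = torch.randn(batch_size, d)
    cands = torch.randn(chunk, d)
    src_rep = torch.repeat_interleave(src, chunk, dim=0)  # s0,s0,s0,s1,s1,s1
    cand_rep = cands.repeat(batch_size, 1)                # c0,c1,c2,c0,c1,c2
    scores = (src_rep * cand_rep).sum(-1).view(batch_size, chunk)
    return scores                                         # (batch, chunk) pairwise scores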
        if interpret_matching:
            gold_id, cand_ids = match
            gold_block = gold[gold_id]
            cand_rules = [cand[j] if j > -1 else None for j in cand_ids]
            cost = objective.GetCoefficient(v[name])
            print('---')
            print('gold block', gold_id, ':', gold_block)
            print('matched to rules:', cand_rules)
            print('with dist', str(cost))

    return matching, status, final_value, max_cost, size_cnt


if __name__ == "__main__":
    add_argument("match_proportion", dtype=float, default=0.7,
                 msg="Proportion of gold blocks to force matches on.")
    add_argument("k_matches", dtype=int, default=10,
                 msg="Number of matches to consider per gold block.")
    add_argument("interpret_matching", dtype=bool, default=False,
                 msg="Flag to print out the rule matching.")
    add_argument('cand_path', dtype=str, default='data/toy_cand_rules.txt',
                 msg='Path to the candidate rule file.')
    add_argument('out_path', dtype=str, msg='File to write the results to.')
    add_argument('max_power_set_size',
class ExtractEvaluator(BaseEvaluator):

    add_argument('matched_threshold', default=0.99, dtype=float,
                 msg='Value of threshold to determine whether two words are matched.')

    def __init__(self, model: ExtractModel, dl: ContinuousTextDataLoader):
        self.model = model
        self.dl = dl
        self.analyzer = ExtractAnalyzer()

    def evaluate(self, stage: str) -> Metrics:
        segments = list()
        predictions = list()
        ground_truths = list()
        matched_segments = list()
        total_num_samples = 0
        analyzed_metrics = Metrics()
        for batch in pbar(self.dl, desc='eval_batch'):
            if g.eval_max_num_samples and total_num_samples + batch.batch_size > g.eval_max_num_samples:
                logging.imp(f'Stopping at {total_num_samples} < {g.eval_max_num_samples} evaluated examples.')
                break
            ret = self.model(batch)
            analyzed_metrics += self.analyzer.analyze(ret, batch)
            segments.extend(list(batch.segments))
            segmentations, _matched_segments = self._get_segmentations(ret, batch)
            predictions.extend(segmentations)
            matched_segments.extend(_matched_segments)
            ground_truths.extend([segment.to_segmentation() for segment in batch.segments])
            total_num_samples += batch.batch_size

        df = _get_df(segments, ground_truths, predictions, matched_segments,
                     columns=('segment', 'ground_truth', 'prediction', 'matched_segment'))
        out_path = g.log_dir / 'predictions' / f'extract.{stage}.tsv'
        out_path.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(out_path, index=None, sep='\t')

        matching_stats = get_matching_stats(predictions, ground_truths)
        prf_scores = get_prf_scores(matching_stats)
        return analyzed_metrics + matching_stats + prf_scores

    def _get_segmentations(self, model_ret: ExtractModelReturn,
                           batch: ContinuousIpaBatch) -> Tuple[List[Segmentation], np.ndarray]:
        # Get the best matched nll.
        start = model_ret.start
        end = model_ret.end
        bmv = model_ret.best_matched_vocab
        bmnll = -model_ret.best_matched_ll
        matched = bmnll < self.model.threshold
        start = start.cpu().numpy()
        end = end.cpu().numpy()
        bmv = bmv.cpu().numpy()
        bmw = self.model.vocab[bmv]  # Best matched word.

        segmentations = list()
        matched_segments = list()
        for segment, s, e, m, w in zip(batch.segments, start, end, matched, bmw):
            spans = list()
            if len(segment) >= g.min_word_length and m:
                span = [segment[i] for i in range(s, e + 1)]
                span = Span('-'.join(span), s, e)
                spans.append(span)
                matched_segments.append(w)
            else:
                matched_segments.append('')
            segmentations.append(Segmentation(spans))
        return segmentations, matched_segments
class MctsTrainer(RLTrainer):

    add_argument('num_mcts_sims', default=100, dtype=int, msg='Number of MCTS simulations to run.')
    add_argument('expansion_batch_size', default=10, dtype=int, msg='Batch size for expansion steps.')
    add_argument('mcts_batch_size', default=128, dtype=int, msg='Batch size for optimizing the MCTS agent.')
    add_argument('replay_buffer_size', default=1024, dtype=int, msg='Size for the replay buffer.')
    add_argument('num_episodes', default=10, dtype=int, msg='Number of episodes.')
    add_argument('num_inner_steps', default=10, dtype=int, msg='Number of optimization steps per batch.')
    add_argument('episode_check_interval', default=10, dtype=int, msg='Frequency of checking episodes.')
    add_argument('tolerance', default=5, dtype=int,
                 msg='Maximum number of epochs without improving the best score before early stopping.')
    add_argument('regress_lambda', default=0.01, dtype=float, msg='Hyperparameter for regression loss.')
    add_argument('use_value_guidance', default=True, dtype=bool,
                 msg='Flag to use predicted values to guide the search.')
    add_argument('tau', default=0.0, dtype=float, msg='Temperature for sampling episodes.')
    add_argument('improved_player_only', default=False, dtype=bool,
                 msg='Flag to use only the improved player between epochs.')

    def __init__(self, *args, mcts: Mcts = None, **kwargs):
        if mcts is None:
            raise TypeError('Must pass a trajectory collector to initialize this trainer.')
        if g.num_mcts_sims % g.expansion_batch_size > 0:
            raise ValueError('`expansion_batch_size` should divide `num_mcts_sims`.')
        self.mcts = mcts
        super().__init__(*args, **kwargs)

        hparams = dict()
        for k, v in g.as_dict().items():
            if not isinstance(v, (int, float, bool, str)):
                v = str(v)
            hparams[k] = v
        self.metric_writer.add_hparams(hparams, dict())

        self.replay_buffer = ReplayBuffer()
        self.best_metrics = Metrics()
        self._old_state = dict()

    def add_trackables(self):
        super().add_trackables()
        self.tracker.add_count_trackable('tolerance', total=g.tolerance)
        step = self.tracker['step']
        episode = step.add_trackable('episode', total=g.num_episodes, endless=True)
        episode.add_trackable('rollout', total=g.max_rollout_length, endless=True)
        episode.add_trackable('mcts', total=g.num_mcts_sims, endless=True)
        step.add_trackable('inner_step', total=g.num_inner_steps, endless=True)

    def evaluate_at_start(self):
        metrics = self.evaluator.evaluate(self.stage, 0)
        self._update_best_score(metrics)
        self.metric_writer.add_metrics(self.best_metrics, self.tracker['step'].value)

    def _update_best_score(self, eval_metrics: Metrics) -> bool:
        best_score = -99999.9
        try:
            best_score = self.best_metrics.best_score.value
        except AttributeError:
            pass
        new_score = eval_metrics['eval/eval_reward'].value
        logging.info(f'Best score is {max(new_score, best_score):.3f}.')
        if new_score >= best_score:  # Matching the best score is tolerated.
            best_score = new_score
            self.best_metrics = Metrics(Metric('best_score', best_score, 1))
            best_path = g.log_dir / 'best_run'
            with best_path.open('w') as fout:
                fout.write(self.stage)
            return True
        return False

    def should_terminate(self, eval_metrics: Metrics) -> bool:
        if not self._update_best_score(eval_metrics):
            logging.imp('eval_reward has not been improved.')
            self.tracker.update('tolerance')
            if g.improved_player_only:
                logging.imp('Loading old state dict.')
                self.agent.load_state_dict(self._old_state)
        else:
            self.tracker.reset('tolerance')
        self.metric_writer.add_metrics(self.best_metrics, self.tracker['step'].value)
        return self.tracker.is_finished('tolerance')

    def train_one_step(self, dl: OnePairDataLoader):
        if g.improved_player_only:
            self._old_state = self.agent.state_dict()

        # Collect episodes with the latest agent first.
        new_tr = self.mcts.collect_episodes(self.mcts.env.start, self.tracker)
        tr_rew = Metric('reward', sum(tr.rewards.sum() for tr in new_tr), g.num_episodes)
        tr_len = Metric('trajectory_length', sum(map(len, new_tr)), g.num_episodes)
        success = Metric('success', sum(tr.done for tr in new_tr), g.num_episodes)
        metrics = Metrics(tr_rew, tr_len, success)

        # Add these new episodes to the replay buffer.
        for i, tr in enumerate(new_tr, 1):
            global_step = i + self.tracker['step'].value * g.num_episodes
            self.metric_writer.add_scalar('episode_reward', tr.rewards.sum(), global_step=global_step)
            self.metric_writer.add_text('trajectory', str(tr), global_step=global_step)
            # NOTE(j_luo) Use temperature if it's positive.
            if g.tau > 0.0:
                weight = math.exp(tr.total_reward * 10.0)
            else:
                weight = 1.0
            for tr_edge in tr:
                self.replay_buffer.append(tr_edge, weight)

        # Main loop.
        from torch.optim import SGD, Adam
        optim_cls = Adam if g.optim_cls == 'adam' else SGD
        optim_kwargs = dict()
        if optim_cls == SGD:
            optim_kwargs['momentum'] = 0.9
        self.set_optimizer(optim_cls, lr=g.learning_rate,
                           weight_decay=g.weight_decay, **optim_kwargs)
        with self.agent.policy_grad(True), self.agent.value_grad(True):
            for _ in range(g.num_inner_steps):
                # Get a batch of training trajectories from the replay buffer.
                edge_batch = self.replay_buffer.sample(g.mcts_batch_size)
                agent_inputs = AgentInputs.from_edges(edge_batch)
                self.agent.train()
                self.optimizer.zero_grad()
                policies = self.agent.get_policy(agent_inputs.id_seqs,
                                                 almts=(agent_inputs.almts1, agent_inputs.almts2))
                # values = self.agent.get_values(agent_inputs.id_seqs, steps=agent_inputs.steps)
                with NoName(policies, agent_inputs.permissible_actions):
                    mask = agent_inputs.permissible_actions == SENTINEL_ID
                    pa = agent_inputs.permissible_actions
                    pa = torch.where(mask, torch.zeros_like(pa), pa)
                    logits = policies.gather(2, pa)
                    logits = torch.where(mask, torch.full_like(logits, -9999.9), logits)
                    logits = logits.log_softmax(dim=-1)
                entropies = (-agent_inputs.mcts_pis * (1e-8 + agent_inputs.mcts_pis).log()).sum(dim=-1)
                pi_ce_losses = (-agent_inputs.mcts_pis * logits).sum(dim=-1) - entropies
                for i in range(7):
                    metrics += Metric(f'entropy_{i}', entropies[:, i].sum(), g.mcts_batch_size)
                    metrics += Metric(f'pi_ce_loss_{i}', pi_ce_losses[:, i].sum(), g.mcts_batch_size)
                # v_regress_losses = 0.5 * (values - agent_inputs.qs) ** 2
                pi_ce_loss = Metric('pi_ce_loss', pi_ce_losses.sum(), g.mcts_batch_size * 7)
                # v_regress_loss = Metric('v_regress_loss', v_regress_losses.sum(), g.mcts_batch_size)
                total_loss = pi_ce_loss.total  # + g.regress_lambda * v_regress_loss.total
                total_loss = Metric('total_loss', total_loss, g.mcts_batch_size)
                total_loss.mean.backward()

                # Clip gradient norm.
                grad_norm = clip_grad(self.agent.parameters(), g.mcts_batch_size)
                metrics += Metrics(total_loss, pi_ce_loss, grad_norm)
                self.optimizer.step()
                self.tracker.update('inner_step')
        return metrics
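
# Sketch: episodes enter the buffer with weight exp(10 * total_reward) when tau > 0,
# so `ReplayBuffer.sample` presumably draws proportionally to the stored weights.
# A minimal weighted-with-replacement sampler under that assumption:
import math
import random

class _WeightedReplay:

    def __init__(self):
        self.items, self.weights = [], []

    def append(self, edge, weight: float) -> None:
        self.items.append(edge)
        self.weights.append(weight)

    def sample(self, batch_size: int) -> list:
        return random.choices(self.items, weights=self.weights, k=batch_size)

# E.g. weights exp(1), exp(5), exp(9): the last edge dominates the draws.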
from dataclasses import dataclass
from itertools import product
from typing import (ClassVar, Dict, Iterator, List, Optional, Sequence, Set, Union)

import numpy as np
import torch

import sound_law.rl.trajectory as tr
from dev_misc import BT, add_argument, g, get_tensor, get_zeros
from dev_misc.utils import Singleton, pbar
from sound_law.data.alphabet import (ANY_ID, ANY_S_ID, ANY_UNS_ID, EMP, EMP_ID,
                                     EOT_ID, NULL_ID, SOT_ID, SYL_EOT_ID, Alphabet)

add_argument('factorize_actions', dtype=bool, default=False, msg='Flag to factorize the action space.')
add_argument('ngram_path', dtype='path', msg='Path to the ngram list.')


@dataclass(eq=True, frozen=True)
class SoundChangeAction:
    """One sound change rule."""

    before_id: int
    after_id: int
    rtype: str
    pre_id: int
    d_pre_id: int
    post_id: int
    d_post_id: int
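
# Sketch: with eq=True, frozen=True, actions are immutable value objects -- equal
# fields compare equal and hash identically, so rules deduplicate in sets and can
# key dicts. Field values below are made up.
def _demo_action_hashing():
    a1 = SoundChangeAction(3, 7, 'basic', -1, -1, 5, -1)
    a2 = SoundChangeAction(3, 7, 'basic', -1, -1, 5, -1)
    assert a1 == a2 and hash(a1) == hash(a2)
    assert len({a1, a2}) == 1  # deduplicates in a set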
import json
from multiprocessing import set_start_method
from typing import Optional

import numpy as np
import torch

from dev_misc import Initiator, add_argument, g, parse_args
from dev_misc.devlib.named_tensor import patch_named_tensors
from dev_misc.trainlib import set_random_seeds
from sound_law.config import a2c_reg, mcts_reg, reg, s2s_reg
from sound_law.train.manager import OnePairManager, OneToManyManager

add_argument('task', dtype=str, default='one_pair', choices=['one_pair', 'one_to_many'],
             msg='Which task to execute.')
add_argument('use_rl', dtype=bool, default=False, msg='Flag to use RL framework.')
add_argument('use_mcts', dtype=bool, default=False, msg='Flag to use MCTS.')
add_argument('agent', dtype=str, default='vpg', choices=['vpg', 'a2c'], msg='RL agent.')


def setup() -> Initiator:
from sound_law.rl.env import SoundChangeEnv
from sound_law.rl.mcts import Mcts  # pylint: disable=no-name-in-module
from sound_law.rl.mcts_cpp import (PyActionSpaceOpt, PyEnv, PyEnvOpt,  # pylint: disable=no-name-in-module
                                   PyMctsOpt, PyWordSpaceOpt)
# pylint: enable=no-name-in-module
from sound_law.rl.trajectory import VocabState
from sound_law.s2s.module import CharEmbedding, EmbParams, PhonoEmbedding
from sound_law.s2s.one_pair import OnePairModel
from sound_law.s2s.one_to_many import OneToManyModel

from .trainer import MctsTrainer, Trainer

add_argument('batch_size', default=32, dtype=int, msg='Batch size.')
add_argument('check_interval', default=10, dtype=int, msg='Frequency to check the training progress.')
add_argument('eval_interval', default=100, dtype=int, msg='Frequency to call the evaluator.')
add_argument('save_interval', dtype=int, msg='Frequency to save the progress and the model.')
add_argument('learning_rate', default=2e-3, dtype=float, msg='Learning rate.')
add_argument('value_learning_rate', default=2e-3, dtype=float,
class SoundChangeEnv(PyEnv):

    tnode_cls = VocabState

    add_argument('final_reward', default=1.0, dtype=float, msg='Final reward for reaching the end.')
    add_argument('step_penalty', default=0.02, dtype=float, msg='Penalty for each step if not the end state.')

    def __init__(self, *args, abc: Alphabet = None, **kwargs):
        self.abc = abc
        # Set class variable for `SoundChangeAction` here.
        SoundChangeAction.abc = abc

        # Register unconditional actions first.
        units = [u for u in abc if u not in abc.special_units]

        def register_unconditional_action(u1: str, u2: str, cl: bool = False, gb: bool = False):
            id1 = abc[u1]
            id2 = abc[u2]
            if cl:
                self.register_cl_map(id1, id2)
            elif gb:
                if u1.startswith('i'):
                    self.register_gbj_map(id1, id2)
                else:
                    assert u1.startswith('u')
                    self.register_gbw_map(id1, id2)
            else:
                self.register_permissible_change(id1, id2)

        for u1, u2 in abc.edges:
            register_unconditional_action(u1, u2)
        for u in units:
            register_unconditional_action(u, EMP)
        for u1, u2 in abc.cl_map.items():
            register_unconditional_action(u1, u2, cl=True)
        for u1, u2 in abc.gb_map.items():
            register_unconditional_action(u1, u2, gb=True)
        # self.set_vowel_info(abc.vowel_mask, abc.vowel_base, abc.vowel_stress,
        #                     abc.stressed_vowel, abc.unstressed_vowel)
        # self.set_glide_info(abc['j'], abc['w'])

    def __call__(self, state: VocabState, best_i: int,
                 action: SoundChangeAction) -> Tuple[VocabState, bool, float]:
        return self.step(state, best_i, action)

    def show_path(self, state: VocabState) -> str:
        out = list()
        for action_id, reward in state.get_path():
            action = self.action_space.get_action(action_id)
            out.append(f'{action}, {reward:.3f}')
        return '(' + ', '.join(out) + ')'

    def apply_action(self, state: VocabState, action: SoundChangeAction) -> VocabState:
        return super().apply_action(state,
                                    action.before_id, action.after_id, action.rtype,
                                    action.pre_id, action.d_pre_id,
                                    action.post_id, action.d_post_id)

    def apply_block(self, state: VocabState, block: List[SoundChangeAction],
                    strict: bool = False) -> VocabState:
        """Apply a block of actions sequentially. If `strict` is `False`,
        only raise an error if none of the rules apply."""
        curr_state = state
        applied = False
        for action in block:
            try:
                curr_state = self.apply_action(curr_state, action)
                applied = True
            except RuntimeError:
                if strict:
                    raise RuntimeError('Using strict mode, some rule in the block does not apply here.')
        if not applied:
            raise RuntimeError('None of the rules in the block applies.')
        return curr_state

    def get_state_edit_dist(self, state1: VocabState, state2: VocabState) -> float:
        return super().get_state_edit_dist(state1, state2)

    def get_num_affected(self, state: VocabState, action: SoundChangeAction) -> int:
        return super().get_num_affected(state,
                                        action.before_id, action.after_id, action.rtype,
                                        action.pre_id, action.d_pre_id,
                                        action.post_id, action.d_post_id)
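
# Sketch of `apply_block` semantics with hypothetical env/state/rules: non-strict
# mode skips inapplicable rules and raises only if none applied; strict mode
# propagates the first failure.
def _demo_apply_block(env: SoundChangeEnv, state: VocabState, block):
    lenient = env.apply_block(state, block)            # skips rules that don't apply
    try:
        strict = env.apply_block(state, block, strict=True)
    except RuntimeError:
        strict = None                                  # some rule did not apply
    return lenient, strict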
class OneToManyManager:
    """The manager class for single-source-multiple-target scenarios."""

    add_argument('train_tgt_langs', dtype=str, nargs='+', msg='Target languages used for training.')

    @staticmethod
    def prepare_raw_data() -> Tuple[List[str], CognateRegistry, Alphabet, Alphabet]:
        """Prepare raw data, including the cognates and the alphabets."""
        # Prepare cognate registry first.
        cr = CognateRegistry()
        all_tgt = sorted([g.tgt_lang] + list(g.train_tgt_langs))
        for tgt in all_tgt:
            cr.add_pair(g.data_path, g.src_lang, tgt)

        # Get alphabets. Note that the target alphabet is based on the union of all
        # target languages, i.e., a shared alphabet for all.
        if g.share_src_tgt_abc:
            src_abc = cr.prepare_alphabet(*(all_tgt + [g.src_lang]))
            tgt_abc = src_abc
        else:
            src_abc = cr.prepare_alphabet(g.src_lang)
            tgt_abc = cr.prepare_alphabet(*all_tgt)

        return all_tgt, cr, src_abc, tgt_abc

    def __init__(self):
        all_tgt, self.cog_reg, self.src_abc, self.tgt_abc = self.prepare_raw_data()

        # Get stats for unseen units.
        stats = self.tgt_abc.stats
        _, test_tgt_path = get_paths(g.data_path, g.src_lang, g.tgt_lang)
        mask = (stats.sum() == stats.loc[test_tgt_path])
        unseen = mask[mask].index.tolist()
        total = len(stats.loc[test_tgt_path].dropna())
        logging.info(f'Unseen units ({len(unseen)}/{total}) for {g.tgt_lang} are: {unseen}.')

        # Get language-to-id mappings. Used only for the targets (i.e., decoder side).
        self.lang2id = lang2id = {tgt: i for i, tgt in enumerate(all_tgt)}

        # Get all data loaders.
        self.dl_reg = DataLoaderRegistry()

        def create_setting(name: str, tgt_lang: str, split: Split, for_training: bool,
                           keep_ratio: Optional[float] = None,
                           tgt_sot: bool = False) -> Setting:
            return Setting(name, 'one_pair', split, g.src_lang, tgt_lang, for_training,
                           keep_ratio=keep_ratio, tgt_sot=tgt_sot)

        test_setting = create_setting(f'test@{g.tgt_lang}', g.tgt_lang, Split('all'), False,
                                      keep_ratio=g.test_keep_ratio)
        settings: List[Setting] = [test_setting]
        # Get the training languages.
        for train_tgt_lang in g.train_tgt_langs:
            if g.input_format == 'ielex':
                train_split = Split('train', [1, 2, 3, 4])  # Use the first four folds for training.
                dev_split = Split('dev', [5])  # Use the last fold for dev.
            else:
                train_split = Split('train')
                dev_split = Split('dev')
            train_setting = create_setting(f'train@{train_tgt_lang}', train_tgt_lang,
                                           train_split, True, keep_ratio=g.keep_ratio)
            train_e_setting = create_setting(f'train@{train_tgt_lang}_e', train_tgt_lang,
                                             train_split, False, keep_ratio=g.keep_ratio)
            dev_setting = create_setting(f'dev@{train_tgt_lang}', train_tgt_lang, dev_split, False)
            test_setting = create_setting(f'test@{train_tgt_lang}', train_tgt_lang, Split('test'), False)
            settings.extend([train_setting, train_e_setting, dev_setting, test_setting])

        for setting in settings:
            self.dl_reg.register_data_loader(setting, self.cog_reg, lang2id=lang2id)

        phono_feat_mat = special_ids = None
        if g.use_phono_features:
            phono_feat_mat = get_tensor(self.src_abc.pfm)
            special_ids = get_tensor(self.src_abc.special_ids)

        self.model = OneToManyModel(len(self.src_abc), len(self.tgt_abc),
                                    len(g.train_tgt_langs) + 1,
                                    lang2id[g.tgt_lang],
                                    lang2id=lang2id,
                                    phono_feat_mat=phono_feat_mat,
                                    special_ids=special_ids)
        if g.saved_model_path is not None:
            self.model.load_state_dict(torch.load(g.saved_model_path,
                                                  map_location=torch.device('cpu')))
            logging.imp(f'Loaded from {g.saved_model_path}.')
        if has_gpus():
            self.model.cuda()
        logging.info(self.model)

        metric_writer = MetricWriter(g.log_dir, flush_secs=5)

        # NOTE(j_luo) Evaluate on every loader that is not for training.
        eval_dls = self.dl_reg.get_loaders_by_name(lambda name: 'train' not in name or '_e' in name)
        self.evaluator = Evaluator(self.model, eval_dls, self.tgt_abc,
                                   metric_writer=metric_writer)

        if not g.evaluate_only:
            train_names = [f'train@{train_tgt_lang}' for train_tgt_lang in g.train_tgt_langs]
            train_settings = [self.dl_reg.get_setting_by_name(name) for name in train_names]
            self.trainer = Trainer(self.model, train_settings,
                                   [1.0] * len(train_settings), 'step',
                                   stage_tnames=['step'],
                                   evaluator=self.evaluator,
                                   check_interval=g.check_interval,
                                   eval_interval=g.eval_interval,
                                   save_interval=g.save_interval,
                                   metric_writer=metric_writer)
            if g.saved_model_path is None:
                self.trainer.init_params('xavier_uniform')
            optim_cls = Adam if g.optim_cls == 'adam' else SGD
            self.trainer.set_optimizer(optim_cls, lr=g.learning_rate)

    def run(self):
        if g.evaluate_only:
            # TODO(j_luo) load global_step from saved model.
            self.evaluator.evaluate('evaluate_only', 0)
        else:
            self.trainer.train(self.dl_reg)
class ExtractModel(nn.Module): add_argument('max_num_words', default=3, dtype=int, msg='Max number of extracted words.') add_argument('max_word_length', default=10, dtype=int, msg='Max length of extracted words.') add_argument( 'init_threshold', default=0.05, dtype=float, msg= 'Initial value of threshold to determine whether two words are matched.' ) add_argument('use_adapt', default=False, dtype=bool, msg='Flag to use adapter layer.') add_argument('init_ins_del_cost', default=100, dtype=float, msg='Initial unit cost for insertions and deletions.') add_argument('min_ins_del_cost', default=3.5, dtype=float, msg='Initial unit cost for insertions and deletions.') add_argument('unextracted_prob', default=0.01, dtype=float, msg='Initial unit cost for insertions and deletions.') add_argument('context_weight', default=0.0, dtype=float, msg='Weight for the context probabilities.') add_argument('debug', dtype=bool, default=False, msg='Flag to enter debug mode.') def __init__(self, lu_size: int): super().__init__() def _has_proper_length(segment): l = len(segment) return g.min_word_length <= l <= g.max_word_length with open(g.vocab_path, 'r', encoding='utf8') as fin: _vocab = set(line.strip() for line in fin) segments = [Segment(w) for w in _vocab] self.vocab = get_array([ segment for segment in segments if _has_proper_length(segment) ]) lengths = torch.LongTensor(list(map(len, self.vocab))) feat_matrix = [segment.feat_matrix for segment in self.vocab] feat_matrix = torch.nn.utils.rnn.pad_sequence(feat_matrix, batch_first=True) max_len = lengths.max().item() source_padding = ~get_length_mask(lengths, max_len) self.register_buffer('vocab_feat_matrix', feat_matrix) self.register_buffer('vocab_source_padding', source_padding) self.register_buffer('vocab_length', lengths) self.vocab_feat_matrix.rename_('vocab', 'length', 'feat_group') self.vocab_source_padding.rename_('vocab', 'length') self.vocab_length.rename_('vocab') with Rename(self.vocab_feat_matrix, vocab='batch'): vocab_dense_feat_matrix = convert_to_dense( self.vocab_feat_matrix) self.vocab_dense_feat_matrix = { k: v.rename(batch='vocab') for k, v in vocab_dense_feat_matrix.items() } # Get the entire set of units from vocab. units = set() for segment in self.vocab: units.update(segment.segment_list) self.id2unit = sorted(units) self.unit2id = {u: i for i, u in enumerate(self.id2unit)} # Now indexify the vocab. Gather feature matrices for units as well. indexed_segments = np.zeros([len(self.vocab), max_len], dtype='int64') unit_feat_matrix = dict() for i, segment in enumerate(self.vocab): indexed_segments[i, range(len(segment))] = [ self.unit2id[u] for u in segment.segment_list ] for j, u in enumerate(segment.segment_list): if u not in unit_feat_matrix: unit_feat_matrix[u] = segment.feat_matrix[j] unit_feat_matrix = [unit_feat_matrix[u] for u in self.id2unit] unit_feat_matrix = torch.nn.utils.rnn.pad_sequence( unit_feat_matrix, batch_first=True) self.register_buffer('unit_feat_matrix', unit_feat_matrix.unsqueeze(dim=1)) self.register_buffer('indexed_segments', torch.from_numpy(indexed_segments)) # Use dummy length to avoid the trouble later on. # HACK(j_luo) Have to provide 'length'. 
self.unit_feat_matrix.rename_('unit', 'length', 'feat_group') self.indexed_segments.rename_('vocab', 'length') with Rename(self.unit_feat_matrix, unit='batch'): unit_dense_feat_matrix = convert_to_dense( self.unit_feat_matrix) self.unit_dense_feat_matrix = { k: v.rename(batch='unit') for k, v in unit_dense_feat_matrix.items() } self.adapter = AdaptLayer() if g.input_format == 'text': self.g2p = G2PLayer(lu_size, len(self.id2unit)) _special_state_keys = [ 'vocab', 'vocab_dense_feat_matrix', 'unit2id', 'id2unit', 'unit_dense_feat_matrix' ] def state_dict(self, **kwargs): state = super().state_dict(**kwargs) for key in self._special_state_keys: attr = drop_names(getattr(self, key)) state[key] = attr return state def load_state_dict(self, state_dict: Dict, **kwargs): with WithholdKeys(state_dict, *self._special_state_keys): super().load_state_dict(state_dict, **kwargs) # HACK(j_luo) This isn't really terse. for key in self._special_state_keys: attr = getattr(self, key) setattr(self, key, state_dict[key]) if torch.is_tensor(attr): names = attr.names getattr(self, key).rename_(*names) elif isinstance(attr, dict): for k, v in getattr(self, key).items(): if torch.is_tensor(v): v.rename_(*attr[k].names) # IDEA(j_luo) The current api is worse than just declaring GlobalProperty(writeable=False) outright. And doesn't give proper type hints. @global_property def threshold(self): pass @global_property def ins_del_cost(self): pass @cached_property def effective_categories(self) -> List[Category]: ret = list() for cat in Category: if should_include(g.feat_groups, cat): ret.append(cat) return ret def forward(self, batch: ExtractBatch) -> ExtractModelReturn: """ The generating story is: v | w | x -- ww -- theta Pr(x) = sum_w Pr(w) Pr(ww) = sum_w Pr(w) theta^|ww| = sum_{w, v} Pr(w | v) Pr(v) theta^|ww| Terminologies: matched_: the prefix after selecting v score: after multiplication with |w| best_: the prefix after selecting w """ # Prepare representations. alignment = None if g.dense_input: # IDEA(j_luo) NoName shouldn't use reveal_name. Just keep the name in the context manager. with NoName(*self.unit_dense_feat_matrix.values()): unit_repr = torch.cat([ self.unit_dense_feat_matrix[cat] for cat in self.effective_categories ], dim=-1) unit_repr = unit_repr.rename('batch', 'length', 'char_emb').squeeze(dim='length') if g.input_format == 'text': ku_char_repr, word_repr = self.g2p(batch.unit_id_seqs, unit_repr) char_log_probs = (ku_char_repr @ unit_repr.t()).log_softmax( dim=-1) alignment = char_log_probs.exp() else: dfm = batch.dense_feat_matrix with Rename(*self.unit_dense_feat_matrix.values(), unit='batch'): adapted_dfm = self.adapter(dfm) with NoName(*adapted_dfm.values()): word_repr = torch.cat([ adapted_dfm[cat] for cat in self.effective_categories ], dim=-1) word_repr.rename_('batch', 'length', 'char_emb') else: with Rename(self.unit_feat_matrix, unit='batch'): word_repr = self.embedding(batch.feat_matrix, batch.source_padding) unit_repr = self.embedding(self.unit_feat_matrix) unit_repr = unit_repr.squeeze('length') unit_repr.rename_(batch='unit') # Main body: extract one span. extracted = Extracted(batch.batch_size) new_extracted = self._extract_one_span(batch, extracted, word_repr, unit_repr, char_log_probs) matches = new_extracted.matches len_e = matches.ll.size('len_e') vs = len(self.vocab) # Get the best score and span. # NOTE(j_luo) Some segments don't have any viable spans. 
        flat_ll = matches.ll.flatten(['len_s', 'len_e', 'vocab'], 'cand')
        flat_viable = new_extracted.viable.expand_as(matches.ll).flatten(['len_s', 'len_e', 'vocab'], 'cand')
        flat_viable_ll = (~flat_viable) * (-9999.9) + flat_ll

        # Add probs for unextracted characters.
        unextracted = batch.lengths.align_as(new_extracted.len_candidates) - new_extracted.len_candidates
        unextracted = unextracted.expand_as(matches.ll)
        flat_unextracted = unextracted.flatten(['len_s', 'len_e', 'vocab'], 'cand')
        flat_unextracted_ll = flat_unextracted * math.log(g.unextracted_prob)
        flat_total_ll = flat_viable_ll + flat_unextracted_ll

        # Get the top candidates based on total scores.
        best_matched_ll, best_span_ind = flat_total_ll.max(dim='cand')
        start = best_span_ind // (len_e * vs)
        # NOTE(j_luo) Don't forget that the length is off by g.min_word_length - 1.
        end = best_span_ind % (len_e * vs) // vs + start + g.min_word_length - 1
        best_matched_vocab = best_span_ind % vs

        if self.training:
            any_viable = new_extracted.viable.any('len_s').any('len_e')
            best_matched_ll = flat_total_ll.logsumexp(dim='cand')
            best_matched_ll = best_matched_ll * any_viable

        ret = ExtractModelReturn(start, end, best_matched_ll, best_matched_vocab, new_extracted, alignment)
        return ret

    def _extract_one_span(self, batch: ExtractBatch, extracted: Extracted, word_repr: FT, unit_repr: FT, char_log_probs: FT) -> Extracted:
        # Propose all span start/end positions.
        start_candidates = get_named_range(batch.max_length, 'len_s').align_to('batch', 'len_s', 'len_e')
        # Range from `min_word_length` to `max_word_length`.
        len_candidates = get_named_range(g.max_word_length + 1 - g.min_word_length, 'len_e') + g.min_word_length
        len_candidates = len_candidates.align_to('batch', 'len_s', 'len_e')
        # This is inclusive.
        end_candidates = start_candidates + len_candidates - 1

        # Only keep the viable/valid spans around.
        viable = (end_candidates < batch.lengths.align_as(end_candidates))
        start_candidates = start_candidates.expand_as(viable)
        len_candidates = len_candidates.expand_as(viable)
        # NOTE(j_luo) Use `viable` to get the lengths. `len_candidates` has dummy axes.
        # IDEA(j_luo) Any better way of handling this? Perhaps persistent names?
        len_s = viable.size('len_s')
        len_e = viable.size('len_e')
        bi = get_named_range(batch.batch_size, 'batch').expand_as(viable)
        with NoName(start_candidates, end_candidates, len_candidates, bi, viable):
            viable_starts = start_candidates[viable].rename('viable')
            viable_lens = len_candidates[viable].rename('viable')
            viable_bi = bi[viable].rename('viable')

        # Get the word positions to get the corresponding representations.
        viable_starts = viable_starts.align_to('viable', 'len_w')
        word_pos_offsets = get_named_range(g.max_word_length, 'len_w').align_as(viable_starts)
        word_pos = viable_starts + word_pos_offsets
        word_pos = word_pos.clamp(max=batch.max_length - 1)

        # Get the corresponding representations.
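
        # A minimal sketch (toy tensors, hypothetical sizes) of the advanced
        # indexing used below to pull out per-span character vectors:
        #
        #     repr = torch.randn(2, 4, 8)    # batch x length x char_emb
        #     bi = torch.tensor([1, 1, 1])   # batch index, repeated per position
        #     pos = torch.tensor([1, 2, 3])  # positions of one viable span
        #     span_repr = repr[bi, pos]      # shape: 3 x 8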
        nh = NameHelper()
        viable_bi = viable_bi.expand_as(word_pos)
        word_pos = nh.flatten(word_pos, ['viable', 'len_w'], 'viable_X_len_w')
        viable_bi = nh.flatten(viable_bi, ['viable', 'len_w'], 'viable_X_len_w')
        word_repr = word_repr.align_to('batch', 'length', 'char_emb')
        if g.input_format == 'text':
            with NoName(word_repr, viable_bi, word_pos, batch.unit_id_seqs):
                extracted_word_repr = word_repr[viable_bi, word_pos].rename('viable_X_len_w', 'char_emb')
                extracted_unit_ids = batch.unit_id_seqs[viable_bi, word_pos].rename('viable_X_len_w')
        else:
            with NoName(word_repr, viable_bi, word_pos):
                extracted_word_repr = word_repr[viable_bi, word_pos].rename('viable_X_len_w', 'char_emb')
            extracted_unit_ids = None
        extracted_word_repr = nh.unflatten(extracted_word_repr, 'viable_X_len_w', ['viable', 'len_w'])

        # Main body: run DP to find the best matches.
        matches = self._get_matches(extracted_word_repr, unit_repr, viable_lens, extracted_unit_ids, char_log_probs)

        # Revert to the old shape (so that invalid spans are included).
        bi = get_named_range(batch.batch_size, 'batch').expand_as(viable)
        lsi = get_named_range(len_s, 'len_s').expand_as(viable)
        lei = get_named_range(len_e, 'len_e').expand_as(viable)
        vs = matches.ll.size('vocab')
        # IDEA(j_luo) NoName shouldn't make size() calls unavailable. Otherwise size() calls have to be moved outside the context. The names should be preserved as well.
        with NoName(bi, lsi, lei, viable, matches.ll):
            v_bi = bi[viable]
            v_lsi = lsi[viable]
            v_lei = lei[viable]
            all_ll = get_zeros(batch.batch_size, len_s, len_e, vs)
            all_ll = all_ll.float().fill_(-9999.9)
            all_ll[v_bi, v_lsi, v_lei] = matches.ll
            matches.ll = all_ll.rename('batch', 'len_s', 'len_e', 'vocab')

        new_extracted = Extracted(batch.batch_size, matches, viable, len_candidates)
        return new_extracted

    def _get_matches(self, extracted_word_repr: FT, unit_repr: FT, viable_lens: LT, extracted_unit_ids: LT, char_log_probs: FT) -> Matches:
        ns = extracted_word_repr.size('viable')
        len_w = extracted_word_repr.size('len_w')
        nt = len(self.vocab_feat_matrix)
        msl = extracted_word_repr.size('len_w')
        mtl = self.vocab_feat_matrix.size('length')

        # Compute the matching scores (inner products) all at once: for each viable span, compare it against all units.
        ctx_logits = extracted_word_repr @ unit_repr.t()
        ctx_log_probs = ctx_logits.log_softmax(dim='unit').flatten(['viable', 'len_w'], 'viable_X_len_w')
        with NoName(char_log_probs, extracted_unit_ids):
            global_log_probs = char_log_probs[extracted_unit_ids].rename('viable_X_len_w', 'unit')
        weighted_log_probs = g.context_weight * ctx_log_probs + (1.0 - g.context_weight) * global_log_probs
        costs = -weighted_log_probs

        # Name: viable x len_w x unit
        costs = costs.unflatten('viable_X_len_w', [('viable', ns), ('len_w', len_w)])

        # NOTE(j_luo) Use a dictionary to save every state.
        fs = dict()
        for i in range(msl + 1):
            fs[(i, 0)] = get_zeros(ns, nt).fill_(i * self.ins_del_cost)
        for j in range(mtl + 1):
            fs[(0, j)] = get_zeros(ns, nt).fill_(j * self.ins_del_cost)

        # ------------------------ Main body: DP ----------------------- #

        # Transition.
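
        # The loop below is a banded edit-distance DP (band width 2): for source
        # prefix length ls and target prefix length lt,
        #
        #     fs[ls, lt] = min(fs[ls - 1, lt]     + ins_del_cost,   # deletion
        #                      fs[ls, lt - 1]     + ins_del_cost,   # insertion
        #                      fs[ls - 1, lt - 1] + sub_cost)       # substitution
        #
        # computed for all (viable span, vocab entry) pairs in parallel.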
        with NoName(self.indexed_segments, costs):
            for ls in range(1, msl + 1):
                min_lt = max(ls - 2, 1)
                max_lt = min(ls + 2, mtl + 1)
                for lt in range(min_lt, max_lt):
                    transitions = list()
                    if (ls - 1, lt) in fs:
                        transitions.append(fs[(ls - 1, lt)] + self.ins_del_cost)
                    if (ls, lt - 1) in fs:
                        transitions.append(fs[(ls, lt - 1)] + self.ins_del_cost)
                    if (ls - 1, lt - 1) in fs:
                        vocab_inds = self.indexed_segments[:, lt - 1]
                        sub_cost = costs[:, ls - 1, vocab_inds]
                        transitions.append(fs[(ls - 1, lt - 1)] + sub_cost)
                    if transitions:
                        all_s = torch.stack(transitions, dim=-1)
                        new_s, _ = all_s.min(dim=-1)
                        fs[(ls, lt)] = new_s

        f_lst = list()
        for i in range(msl + 1):
            for j in range(mtl + 1):
                if (i, j) not in fs:
                    fs[(i, j)] = get_zeros(ns, nt).fill_(9999.9)
                f_lst.append(fs[(i, j)])
        f = torch.stack(f_lst, dim=0).view(msl + 1, mtl + 1, -1, len(self.vocab))
        f.rename_('len_w_src', 'len_w_tgt', 'viable', 'vocab')

        # Get the values we want.
        with NoName(f, viable_lens, self.vocab_length):
            idx_src = viable_lens.unsqueeze(dim=-1)
            idx_tgt = self.vocab_length
            viable_i = get_range(ns, 2, 0)
            vocab_i = get_range(len(self.vocab_length), 2, 1)
            nll = f[idx_src, idx_tgt, viable_i, vocab_i]
            nll.rename_('viable', 'vocab')

        # Get the best spans.
        matches = Matches(-nll, f)
        return matches
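
# A minimal, self-contained sketch of the banded DP in `_get_matches`, run on
# plain strings instead of batched named tensors. The function name and the
# 0/1 substitution cost are illustrative stand-ins for the model's learned
# costs, not part of the model itself.
def _edit_dist_sketch(src: str, tgt: str, ins_del_cost: float = 1.0) -> float:
    # Boundary rows/columns: all-deletion or all-insertion prefixes.
    fs = {(i, 0): i * ins_del_cost for i in range(len(src) + 1)}
    fs.update({(0, j): j * ins_del_cost for j in range(len(tgt) + 1)})
    for ls in range(1, len(src) + 1):
        # The same band as above: only target prefixes within 2 of the source prefix.
        for lt in range(max(ls - 2, 1), min(ls + 2, len(tgt) + 1)):
            sub_cost = 0.0 if src[ls - 1] == tgt[lt - 1] else 1.0
            transitions = []
            if (ls - 1, lt) in fs:
                transitions.append(fs[(ls - 1, lt)] + ins_del_cost)    # deletion
            if (ls, lt - 1) in fs:
                transitions.append(fs[(ls, lt - 1)] + ins_del_cost)    # insertion
            if (ls - 1, lt - 1) in fs:
                transitions.append(fs[(ls - 1, lt - 1)] + sub_cost)    # substitution
            if transitions:
                fs[(ls, lt)] = min(transitions)
    # Cells outside the band are unreachable; mirror the 9999.9 sentinel above.
    return fs.get((len(src), len(tgt)), 9999.9)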
def simulate(
    raw_inputs: Optional[List[Tuple[List[str], List[str], List[str]]]] = None
) -> Tuple[OnePairManager, List[SoundChangeAction], List[PlainState], List[str]]:
    add_argument("in_path", dtype=str, msg="Input path to the saved path file.")

    # Get the alphabet and the action space.
    initiator = setup()
    initiator.run()
    manager = OnePairManager()

    dump = pickle.load(open(g.segments_dump_path, 'rb'))
    _fp.load_repository(dump['proto_ph_map'].keys())

    # Get the list of rules.
    gold = list()
    if raw_inputs is not None:
        for ri in raw_inputs:
            gold.extend(get_actions(*ri))
    elif g.in_path:
        if str(g.in_path).endswith('tsv'):
            lines = pd.read_csv(g.in_path, sep='\t')['action'].values  # type: ignore
        else:
            with open(g.in_path, 'r', encoding='utf8') as fin:
                lines = [line.strip() for line in fin.readlines()]
        gold = get_actions(lines)
    else:
        df = pd.read_csv('data/test_annotations.csv')
        df = df.dropna(subset=['ref no.'])
        for ref in ref_no[g.tgt_lang]:
            rows = df[df['ref no.'].str.startswith(ref)]
            gold.extend(get_actions(rows['rule'], orders=rows['order'], refs=rows['ref no.']))

    # Simulate the actions and get the distance.
    PlainState.env = manager.env
    PlainState.end_state = PlainState(manager.env.end)
    PlainState.abc = manager.tgt_abc
    state = PlainState(manager.env.start)
    states = [state]
    actions = list()
    refs = list()
    expanded_gold = list()

    # def test(s1, s2):
    #     seq1 = [PlainState.abc[u] for u in s1.split()]
    #     seq2 = [PlainState.abc[u] for u in s2.split()]
    #     return PlainState.env.get_edit_dist(seq1, seq2)

    logging.info(f"Starting dist: {state.dist:.3f}")
    for hr in gold:
        if hr.expandable:
            action_q = hr.specialize(state)
            logging.warning(f"This is an expandable rule: {hr}")
        else:
            action_q = [hr.to_action()]
        for action in action_q:
            logging.info(f"Applying {action}")
            state = state.apply_action(action)
            states.append(state)
            actions.append(action)
            refs.append(hr.ref)
            expanded_gold.append(action)
            logging.info(f"New dist: {state.dist:.3f}")
    # NOTE(j_luo) We can only score based on expanded rules.
    gold = expanded_gold

    return manager, gold, states, refs
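
# A hedged usage sketch (assuming the usual configuration bootstrap has set
# `tgt_lang`, `segments_dump_path`, etc.); the rule string and the
# (rules, orders, refs) tuple layout are hypothetical:
#
#     raw = [(['a > e / _ i'], ['1'], ['ref-1'])]
#     manager, gold, states, refs = simulate(raw_inputs=raw)
#     print(states[-1].dist)  # distance to the end state after applying all rules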