Example 1
class DataLoaderRegistry(BaseDataLoaderRegistry):

    add_argument('data_path', dtype='path', msg='Path to the dataset.')
    add_argument('src_lang', dtype=str, msg='ISO code for the source language.')
    add_argument('tgt_lang', dtype=str, msg='ISO code for the target language.')
    add_argument('input_format', dtype=str, choices=['wikt', 'ielex'], default='ielex', msg='Input format.')

    def get_data_loader(self, setting: BaseSetting, cog_reg: CognateRegistry, **kwargs) -> BaseDataLoader:
        if setting.task == 'one_pair':
            # TODO(j_luo) The options can all be part of setting.
            dl_cls = VSOnePairDataLoader if g.use_rl else OnePairDataLoader
            dl = dl_cls(setting, cog_reg, **kwargs)
        else:
            raise ValueError(f'Cannot understand this task "{setting.task}".')
        return dl
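
A note on the pattern above: the bare `add_argument` calls in the class body register options in a global configuration object (`g`) as a side effect of class creation. A minimal sketch of how such a registry can work, using a simplified stand-in rather than the actual dev_misc API:

from types import SimpleNamespace

g = SimpleNamespace()   # stands in for dev_misc's global config `g` (assumption)
_specs = {}             # name -> (dtype, default)

def add_argument(name, dtype=str, default=None, msg='', choices=None, nargs=None):
    # Record the spec and expose the default on `g` right away.
    _specs[name] = (dtype, default)
    setattr(g, name, default)

class MyRegistry:
    # Executes at class-creation time: importing the module registers the option.
    add_argument('data_path', dtype=str, msg='Path to the dataset.')

print(g.data_path)  # None until overridden by command-line parsing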
Example 2
class BaseTrainer(BaseTrainerDev):

    add_argument('num_steps',
                 default=1000,
                 dtype=int,
                 msg='Number of steps for training.')
    add_argument('save_model',
                 dtype=bool,
                 default=True,
                 msg='Flag to save model.')
    add_argument('almt_reg_hyper',
                 dtype=float,
                 default=0.0,
                 msg='Hyperparameter for alignment.')
    add_argument('weight_decay',
                 dtype=float,
                 default=0.0,
                 msg='Hyperparameter for weight decay.')
    add_argument('concentration_scale',
                 dtype=float,
                 default=1.0,
                 msg='Hyperparameter for concentration scale.')
    add_argument('train_mode',
                 dtype=str,
                 default='mle',
                 choices=['mle', 'mrt'],
                 msg='Training mode: either MRT or MLE.')
    add_argument('init_entropy_reg',
                 dtype=float,
                 default=0.0,
                 msg='Initial entropy regularization hyperparameter.')
    add_argument('end_entropy_reg',
                 dtype=float,
                 default=0.0,
                 msg='Bound for entropy regularization hyperparameter.')
    add_argument(
        'when_entropy_reg',
        dtype=int,
        default=100,
        msg='When to reach the bound for entropy regularization hyperparameter.'
    )

    def add_trackables(self):
        self.tracker.add_count_trackable('step', g.num_steps)

    def save(self, eval_metrics: Metrics):
        if g.save_model:
            path = g.log_dir / 'saves' / f'model.{self.stage}.pth'
            path.parent.mkdir(exist_ok=True, parents=True)
            torch.save(self.model.state_dict(), path)
            logging.info(f'Model saved to {path}.')
        else:
            logging.info('Model not saved.')
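
The three entropy-regularization arguments suggest a schedule that moves from `init_entropy_reg` to `end_entropy_reg`, reaching the bound at step `when_entropy_reg`. The schedule itself is not shown in this snippet; a plausible linear sketch:

def entropy_reg_at(step: int, init_reg: float, end_reg: float, when: int) -> float:
    # Linear interpolation from `init_reg` to `end_reg`, clamped once `when` is reached.
    frac = min(step, when) / when
    return init_reg + (end_reg - init_reg) * frac

print(entropy_reg_at(50, 0.1, 0.0, 100))   # 0.05: halfway to the bound
print(entropy_reg_at(200, 0.1, 0.0, 100))  # 0.0: bound reached at step 100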
Example 3
class OneToManyModel(BaseModel):

    add_argument('lang_emb_mode',
                 default='mean',
                 dtype=str,
                 choices=['random', 'mean', 'lang2vec', 'wals'],
                 msg='Mode for the language embedding module.')
    add_argument(
        'l2v_feature_set',
        default=None,
        dtype=str,
        choices=[
            'phonology_average', 'phonology_wals', 'phonology_ethnologue',
            'learned'
        ],
        msg='Which feature set to use for the lang2vec language embeddings.')

    def __init__(self,
                 num_src_chars: int,
                 num_tgt_chars: int,
                 num_tgt_langs: int,
                 unseen_idx: int,
                 lang2id: Optional[Dict[str, int]] = None,
                 phono_feat_mat: Optional[LT] = None,
                 special_ids: Optional[Sequence[int]] = None):
        super().__init__(num_src_chars,
                         num_tgt_chars,
                         phono_feat_mat=phono_feat_mat,
                         special_ids=special_ids)
        self.lang_emb = LanguageEmbedding(num_tgt_langs,
                                          g.char_emb_size,
                                          unseen_idx=unseen_idx,
                                          lang2id=lang2id,
                                          mode=g.lang_emb_mode,
                                          dropout=g.dropout)

    def _prepare_lang_emb(self, batch: Union[OnePairBatch,
                                             SourceOnlyBatch]) -> FT:
        return self.lang_emb(batch.tgt_lang_id)
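
How the 'mean' embedding mode treats the `unseen_idx` language is not shown here. A self-contained sketch of one common interpretation, assuming the unseen language receives the average of the trained embeddings (the hypothetical MeanLanguageEmbedding below is not the real LanguageEmbedding):

import torch
import torch.nn as nn

class MeanLanguageEmbedding(nn.Module):
    """Sketch: unseen languages get the average of the seen language embeddings."""

    def __init__(self, num_langs: int, emb_size: int, unseen_idx: int):
        super().__init__()
        self.emb = nn.Embedding(num_langs, emb_size)
        self.unseen_idx = unseen_idx

    def forward(self, lang_id: torch.Tensor) -> torch.Tensor:
        out = self.emb(lang_id)
        # Mean over the seen rows only (skip the unseen placeholder row).
        seen = torch.cat([self.emb.weight[:self.unseen_idx],
                          self.emb.weight[self.unseen_idx + 1:]])
        mean_emb = seen.mean(dim=0)
        # Replace rows for the unseen language with the mean embedding.
        return torch.where((lang_id == self.unseen_idx).unsqueeze(-1), mean_emb, out)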
Example 4
class BeamSearcher(BaseSearcher):

    add_argument('beam_size', default=200, dtype=int, msg='Size of beam.')

    def search_by_probs(self, lengths: LT,
                        label_log_probs: FT) -> Tuple[LT, FT]:
        max_length = lengths.max().item()
        bs = label_log_probs.size('batch')
        label_log_probs = label_log_probs.align_to('length', 'batch', 'label')
        beam = Beam(bs)
        for step in range(max_length):
            __label_log_probs = label_log_probs[step]
            within_length = (step < lengths).align_as(__label_log_probs)
            beam.extend(__label_log_probs * within_length.float())
        beam.finish_search(lengths)
        samples = beam.samples.rename(beam='sample')
        sample_log_probs = beam.sample_log_probs.rename(beam='sample')
        return samples, sample_log_probs
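
The `within_length` mask zeroes out log-prob contributions once a sequence is past its length, so finished sequences stop affecting accumulated beam scores. A minimal numeric illustration:

import torch

# Two sequences with lengths 3 and 1; at step 2 only the first is still active.
lengths = torch.tensor([3, 1])
step = 2
log_probs = torch.tensor([-0.5, -2.0])     # per-sequence log-prob at this step
within_length = (step < lengths).float()   # tensor([1., 0.])
print(log_probs * within_length)           # tensor([-0.5, -0.]): the finished sequence adds nothing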
Example 5
class G2PLayer(nn.Module):

    add_argument('g2p_window_size',
                 default=3,
                 dtype=int,
                 msg='Window size for g2p layer.')

    def __init__(self, lu_size: int, ku_size: int):
        """`lu_size`: number of lost units, `ku_size`: number of known units."""
        super().__init__()

        self.unit_aligner = nn.Embedding(lu_size, ku_size)
        self.unit_aligner.weight.data.fill_(0.0)
        logging.imp('Unit aligner initialized to 0.')

        self.conv = nn.Conv1d(g.dim,
                              g.dim,
                              g.g2p_window_size,
                              padding=g.g2p_window_size // 2)
        self.dropout = nn.Dropout(g.dropout)

    def forward(self, ku_id_seqs: LT, lu_repr: FT) -> Tuple[FT, FT]:
        """Returns lu x ku representation and bs x l x ku representation."""
        ku_char_weight = self.unit_aligner.weight
        ku_char_repr = ku_char_weight @ lu_repr

        ku_char_repr = ku_char_repr.refine_names('ku_char_emb', 'char_emb')
        with NoName(ku_char_repr, ku_id_seqs):
            _ku_repr = ku_char_repr[ku_id_seqs].rename('batch', 'length',
                                                       'char_emb')
        _ku_repr = _ku_repr.align_to('batch', 'char_emb', ...)
        with NoName(_ku_repr):
            ku_ctx_repr = self.conv(_ku_repr).rename('batch', 'char_emb',
                                                     'length')
        ku_ctx_repr = ku_ctx_repr.align_to(..., 'char_emb')
        ku_ctx_repr = self.dropout(ku_ctx_repr)

        return ku_char_repr, ku_ctx_repr
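
`padding=g.g2p_window_size // 2` keeps the sequence length unchanged for odd window sizes, which is why the convolution output can be renamed straight back to a ('batch', 'char_emb', 'length') layout. A quick check with plain tensors:

import torch
import torch.nn as nn

dim, window, length = 8, 3, 10
conv = nn.Conv1d(dim, dim, window, padding=window // 2)
x = torch.randn(2, dim, length)  # (batch, channels, length)
print(conv(x).shape)             # torch.Size([2, 8, 10]): length preserved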
Example 6
from dev_misc.devlib.dp import EditDist
from dev_misc.devlib.tensor_x import TensorX as Tx
from dev_misc.trainlib import Metric, Metrics
from dev_misc.trainlib.tb_writer import MetricWriter
from dev_misc.utils import pad_for_log, pbar
from sound_law.data.alphabet import Alphabet
from sound_law.data.data_loader import OnePairBatch, OnePairDataLoader
from sound_law.evaluate.edit_dist import edit_dist_all
from sound_law.rl.mcts import Mcts
from sound_law.s2s.decoder import get_beam_probs
from sound_law.s2s.one_pair import OnePairModel
from sound_law.train.trainer import get_ce_loss

add_argument('eval_mode',
             dtype=str,
             default='edit_dist',
             choices=['prob', 'edit_dist'],
             msg='Evaluation mode using probabilities or edit distance.')
add_argument('comp_mode',
             dtype=str,
             default='str',
             choices=['ids', 'units', 'str', 'ids_gpu'],
             msg='Comparison mode.')
add_argument('use_phono_edit_dist',
             dtype=bool,
             default=False,
             msg='Flag to use phonologically-aware edit distance.')
add_argument('phono_edit_dist_scale',
             dtype=float,
             default=1.0,
             msg='Scaling factor for phonological edit distance.')
Example 7
from pathlib import Path
from typing import List

import sound_law.rl.rule as rule
from dev_misc import add_argument, g
from sound_law.rl.action import SoundChangeAction  # assumed module path for the rule type

if __name__ == "__main__":
    add_argument("calc_metric",
                 dtype=bool,
                 default=False,
                 msg="Whether to calculate the metrics.")
    add_argument("out_path", dtype=str, msg="Path to the output file.")

    manager, gold, states, refs = rule.simulate()
    initial_state = states[0]
    if g.in_path:
        assert len(gold) == len(states) - 1
        if g.out_path:
            with Path(g.out_path).open('w', encoding='utf8') as fout:
                for state in states:
                    fout.write(f'{state.dist}\n')

    if g.calc_metric:
        # Compute the similarity between the candidate ruleset and the gold-standard ruleset.
        # Let `candidate` be the model's ruleset, which we compare to gold.
        candidate: List[SoundChangeAction] = None
        # First, what % of the gold ruleset is present in the candidate?
        n_shared_actions = 0
        n_similar_actions = 0  # Similar actions get half credit; counted separately so both stay ints.
        # TODO(djwyen) weight "partial credit" based on how similar the effects of the rules are, which can be calculated off distance
        for action in gold:
            similar_actions = manager.action_space.get_similar_actions(action)
            for candidate_act in candidate:
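
The snippet breaks off here, but the comments spell out the intended metric: exact matches count fully and similar actions earn half credit. A self-contained sketch of that computation, assuming hashable rule objects and a `get_similar_actions`-style lookup passed in as `similar_lookup`:

def ruleset_recall(gold, candidate, similar_lookup):
    """Fraction of gold rules recovered; similar rules earn half credit."""
    cand_set = set(candidate)
    n_shared = n_similar = 0
    for action in gold:
        if action in cand_set:
            n_shared += 1
        elif any(sim in cand_set for sim in similar_lookup(action)):
            n_similar += 1
    return (n_shared + 0.5 * n_similar) / len(gold)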
Example 8
class Mcts(PyMcts):
    """Monte Carlo Tree Search class. Everything should be done on cpu except for evaluation.
    Use numpy arrays by default since we can potentially speed up some process through cython
    and parallel processing.
    """

    add_argument('puct_c',
                 default=5.0,
                 dtype=float,
                 msg='Exploration constant.')
    add_argument('virtual_loss',
                 default=1.0,
                 dtype=float,
                 msg='Virtual loss per game.')
    add_argument('game_count',
                 default=3,
                 dtype=int,
                 msg='How many virtual games lost.')
    add_argument('heur_c', default=1.0, dtype=float, msg='Heuristic constant.')
    add_argument('mixing',
                 default=0.5,
                 dtype=float,
                 msg='Mixing lambda hyperparameter.')
    add_argument('num_workers',
                 default=4,
                 dtype=int,
                 msg='Number of workers for parallelizing MCTS.')
    add_argument('dirichlet_alpha',
                 default=0.03,
                 dtype=float,
                 msg='Alpha value for the Dirichlet noise.')
    add_argument('noise_ratio',
                 default=0.25,
                 dtype=float,
                 msg='Mixing ratio for the Dirichlet noise.')
    add_argument('play_strategy',
                 default='max',
                 dtype=str,
                 choices=['max', 'sample_ac', 'sample_mv'],
                 msg='Play strategy.')
    add_argument('exponent',
                 default=1.0,
                 dtype=float,
                 msg='The exponent for sample_ac play strategy.')

    def __init__(self, *args, agent: BasePG = None, **kwargs):
        self.agent = agent
        if g.play_strategy == 'max':
            self.play_strategy = PyPS_MAX
        else:
            self.play_strategy = PyPS_SAMPLE_AC

    def reset(self):
        # Clear priors first and then stats -- stats are needed to speed up clearing.
        self.env.clear_priors(self.env.start, True)
        self.env.clear_stats(self.env.start, True)
        logging.debug(f'#trie nodes {self.env.evict(500000)}')

    def evaluate(self,
                 states,
                 steps: Optional[Union[int, LT]] = None) -> List[float]:
        """Expand and evaluate the leaf node."""
        values = [None] * len(states)
        outstanding_idx = list()
        outstanding_states = list()
        # Deal with end states first.
        for i, state in enumerate(states):
            if state.stopped or state.done:
                # NOTE(j_luo) This value is used for backup. If already reaching the end state, the final reward is either accounted for by the step reward, or by the value network. Therefore, we need to set it to 0.0 here.
                values[i] = 0.0
            else:
                outstanding_idx.append(i)
                outstanding_states.append(state)

        # Collect states that need evaluation.
        if outstanding_states:
            almts1 = almts2 = None
            if g.use_alignment:
                id_seqs, almts1, almts2 = parallel_stack_ids(
                    outstanding_states, g.num_workers, True,
                    self.env.max_end_length)
                almts1 = get_tensor(almts1).rename('batch', 'word', 'pos')
                almts2 = get_tensor(almts2).rename('batch', 'word', 'pos')
            else:
                id_seqs = parallel_stack_ids(outstanding_states, g.num_workers,
                                             False, self.env.max_end_length)
            id_seqs = get_tensor(id_seqs).rename('batch', 'word', 'pos')
            if steps is not None and not isinstance(steps, int):
                steps = steps[outstanding_idx]

            # TODO(j_luo) Scoped might be wrong here.
            # with ScopedCache('state_repr'):
            # NOTE(j_luo) Don't forget to call exp().
            priors = self.agent.get_policy(id_seqs,
                                           almts=(almts1, almts2)).exp()
            with NoName(priors):
                meta_priors = priors[:, [0, 2, 3, 4, 5, 6]].cpu().numpy()
                special_priors = priors[:, 1].cpu().numpy()
            if g.use_value_guidance:
                agent_values = self.agent.get_values(
                    id_seqs, steps=steps).cpu().numpy()
            else:
                agent_values = np.zeros([len(id_seqs)], dtype='float32')

            for i, state, mp, sp, v in zip(outstanding_idx, outstanding_states,
                                           meta_priors, special_priors,
                                           agent_values):
                # NOTE(j_luo) Values should be returned even if states are duplicates or have been visited.
                values[i] = v
                # NOTE(j_luo) Skip duplicate states (due to exploration collapse) or visited states (due to rollout truncation).
                if not state.is_leaf():
                    continue

                self.env.evaluate(state, mp, sp)
        return values

    def add_noise(self, state: VocabState):
        """Add Dirichlet noise to `state`, usually the root."""
        noise = np.random.dirichlet(
            g.dirichlet_alpha *
            np.ones(7 * len(self.env.abc))).astype('float32')
        noise = noise.reshape(7, -1)
        meta_noise = noise[:6]
        special_noise = noise[6, :6]
        self.env.add_noise(state, meta_noise, special_noise, g.noise_ratio)

    def collect_episodes(self,
                         init_state: VocabState,
                         tracker: Optional[Tracker] = None,
                         num_episodes: int = 0,
                         is_eval: bool = False,
                         no_simulation: bool = False) -> List[Trajectory]:
        trajectories = list()
        self.agent.eval()
        if is_eval:
            self.eval()
        else:
            self.train()
        num_episodes = num_episodes or g.num_episodes
        with self.agent.policy_grad(False), self.agent.value_grad(False):
            for ei in range(num_episodes):
                root = init_state
                self.reset()
                steps = 0 if g.use_finite_horizon else None
                self.evaluate([root], steps=steps)

                # Episodes have max rollout length.
                played_path = None
                for ri in range(g.max_rollout_length):
                    if not is_eval:
                        self.add_noise(root)
                    if is_eval and no_simulation:
                        new_state = self.select_one_pi_step(root)
                        steps = steps + 1 if g.use_finite_horizon else None
                        values = self.evaluate([new_state], steps=steps)
                    else:
                        # Run many simulations before taking one action. Simulations take place in batches;
                        # each batch is evaluated and expanded after batched selection.
                        num_batches = g.num_mcts_sims // g.expansion_batch_size
                        for _ in range(num_batches):
                            paths, steps = self.select(root,
                                                       g.expansion_batch_size,
                                                       ri,
                                                       g.max_rollout_length,
                                                       played_path)
                            steps = get_tensor(
                                steps) if g.use_finite_horizon else None
                            new_states = [
                                path.get_last_node() for path in paths
                            ]
                            values = self.evaluate(new_states, steps=steps)
                            self.backup(paths, values)
                            if tracker is not None:
                                tracker.update('mcts',
                                               incr=g.expansion_batch_size)
                        if ri == 0 and ei % g.episode_check_interval == 0:
                            k = min(20, root.num_actions)
                            logging.debug(
                                pad_for_log(
                                    str(
                                        get_tensor(
                                            root.action_counts).topk(k))))
                            logging.debug(
                                pad_for_log(str(get_tensor(root.q).topk(k))))
                            logging.debug(
                                pad_for_log(
                                    str(get_tensor(root.max_values).topk(k))))
                    ps = self.play_strategy
                    if is_eval:
                        if no_simulation:
                            ps = PyPS_SAMPLE_AC
                        else:
                            ps = PyPS_MAX
                    new_path = self.play(root, ri, ps, g.exponent)
                    if played_path is None:
                        played_path = new_path
                    else:
                        played_path.merge(new_path)
                    root = played_path.get_last_node()

                    if tracker is not None:
                        tracker.update('rollout')
                    if root.stopped or root.done:
                        break
                trajectory = Trajectory(played_path, self.env.max_end_length)
                if ei % g.episode_check_interval == 0:
                    logging.debug(pad_for_log(str(trajectory)))

                trajectories.append(trajectory)
                if tracker is not None:
                    tracker.update('episode')

        return trajectories
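
`env.add_noise` is opaque here, but `dirichlet_alpha` and `noise_ratio` point to the standard AlphaZero-style root noise, where the prior is mixed with Dirichlet noise as p' = (1 - eps) * p + eps * n. A sketch of that mixing step, assuming a flat prior array:

import numpy as np

def mix_dirichlet_noise(priors: np.ndarray, alpha: float, noise_ratio: float) -> np.ndarray:
    # Keep (1 - noise_ratio) of the prior and blend in noise_ratio of Dirichlet noise.
    noise = np.random.dirichlet(alpha * np.ones(priors.shape[-1])).astype(priors.dtype)
    return (1.0 - noise_ratio) * priors + noise_ratio * noise

priors = np.full(4, 0.25, dtype='float32')
print(mix_dirichlet_noise(priors, alpha=0.03, noise_ratio=0.25))  # still sums to 1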
Example 9
import pandas as pd

from dev_misc import Arg, add_argument, add_check, g
from dev_misc.utils import handle_sequence_inputs
from pypheature.nphthong import Nphthong
from pypheature.process import FeatureProcessor

from .alphabet import Alphabet
from .dataset import OnePairDataset
from .setting import Setting

Lang = NewType('Lang', str)
DF = pd.DataFrame

add_argument('use_stress', dtype=bool, default=True, msg='Flag to use stress.')
add_argument('use_duration', dtype=bool, default=True, msg='Flag to use duration (long or short).')
add_argument('use_diacritics', dtype=bool, default=True, msg='Flag to use diacritics.')
add_argument('use_duplicate_phono', dtype=bool, default=True,
             msg='Whether to keep duplicate symbols based on their phonological features.')
add_argument('noise_level', dtype=float, default=0.0, msg='Noise level on the target side.')
add_argument('stress_included', dtype=bool, default=False,
             msg='Flag to indicate that stress has already been included in the source language.')
add_check(
    (Arg('use_duplicate_phono') == False) | (Arg('separate_output') == True) | (Arg('use_phono_features') == False))


_fp = FeatureProcessor()


@handle_sequence_inputs
Example 10
class DecipherEvaluator(BaseEvaluator):

    add_argument('eval_max_num_samples',
                 default=0,
                 dtype=int,
                 msg='Max number of samples to evaluate on.')

    def __init__(self, model: DecipherModel, dl_reg: DataLoaderRegistry,
                 tasks: Sequence[DecipherTask]):
        self.model = model
        self.dl_reg = dl_reg
        self.tasks = tasks
        self.analyzer = DecipherAnalyzer()

    def evaluate(self, stage: str) -> Metrics:
        metrics = Metrics()
        with torch.no_grad():
            self.model.eval()
            for task in self.tasks:
                dl = self.dl_reg[task]
                task_metrics = self._evaluate_one_data_loader(dl, stage)
                metrics += task_metrics.with_prefix_(task)
        return metrics

    def _evaluate_one_data_loader(self, dl: ContinuousTextDataLoader,
                                  stage: str) -> Metrics:
        task = dl.task
        accum_metrics = Metrics()

        # Get all metrics from batches.
        dfs = list()
        total_num_samples = 0
        for batch in dl:
            if g.eval_max_num_samples and total_num_samples + batch.batch_size > g.eval_max_num_samples:
                logging.imp(
                    f'Stopping at {total_num_samples} < {g.eval_max_num_samples} evaluated examples from {task}.'
                )
                break

            model_ret = self.model(batch)

            batch_metrics, batch_df = self.predict(model_ret, batch)
            accum_metrics += batch_metrics
            # accum_metrics += self.analyzer.analyze(model_ret, batch)
            total_num_samples += batch.batch_size
            dfs.append(batch_df)

        df = pd.concat(dfs, axis=0)
        # Write the predictions to file.
        out_path = g.log_dir / 'predictions' / f'{task}.{stage}.tsv'
        out_path.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(out_path, index=None, sep='\t')

        # Compute P/R/F scores.
        accum_metrics += get_prf_scores(accum_metrics)
        return accum_metrics

    def _get_predictions(self, model_ret: DecipherModelReturn,
                         batch: ContinuousIpaBatch) -> List[Segmentation]:
        label_log_probs = model_ret.probs.label_log_probs.align_to(
            'batch', 'length', 'label')
        _, tag_seqs = label_log_probs.max(dim='label')
        tag_seqs = tag_seqs.align_to('batch', 'sample', 'length').int()
        lengths = batch.lengths.align_to('batch', 'sample').int()
        segment_list = None
        if self.model.vocab is not None:
            segment_list = [segment.segment_list for segment in batch.segments]
        packed_words = self.model.pack(tag_seqs,
                                       lengths,
                                       batch.feat_matrix,
                                       batch.segments,
                                       segment_list=segment_list)
        segments_by_batch = packed_words.sampled_segments_by_batch
        # Only take the first (and only) sample.
        predictions = [segments[0] for segments in segments_by_batch]
        return predictions

    def predict(self, model_ret: DecipherModelReturn,
                batch: ContinuousIpaBatch) -> Tuple[Metrics, pd.DataFrame]:
        metrics = Metrics()
        predictions = self._get_predictions(model_ret, batch)
        ground_truths = [
            segment.to_segmentation() for segment in batch.segments
        ]
        matching_stats = get_matching_stats(predictions, ground_truths)
        metrics += matching_stats

        df = _get_df(batch.segments, ground_truths, predictions)

        return metrics, df
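
`get_prf_scores` is not shown; assuming the usual definitions over matching counts (precision = matched / predicted, recall = matched / gold, F1 = their harmonic mean), it reduces to:

def prf(num_matched: int, num_pred: int, num_gold: int):
    precision = num_matched / num_pred if num_pred else 0.0
    recall = num_matched / num_gold if num_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(prf(8, 10, 16))  # (0.8, 0.5, 0.6153...)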
Example 11
    def __hash__(self):
        return hash(tuple(self.segment_list))


class BaseSegmentWithGoldTagSeq(BaseSegment):

    has_gold_tag_seq: ClassVar[bool] = True

    @property
    @abstractmethod
    def gold_tag_seq(self) -> LT:
        ...


add_argument('min_word_length',
             default=4,
             dtype=int,
             msg='Min length of words.')


class Segment(BaseSegmentWithGoldTagSeq):
    def __init__(self, raw_token: str):
        self._raw_token = raw_token
        self.is_noise = raw_token.startswith('#')
        self.token = raw_token[1:] if self.is_noise else raw_token
        self.ipa = get_string(self.token)
        self._merged = False
        if len(self.ipa) == 0:
            raise ValueError('Invalid IPA string.')
        self._apply_all()
        self._merge()
        self._indexify()
Example 12
class LstmDecoder(nn.Module, BaseBeamSearcher):
    """A decoder that unrolls the LSTM decoding procedure by steps."""

    add_argument('input_feeding',
                 default=False,
                 dtype=bool,
                 msg='Flag to use input feeding.')

    def __init__(self,
                 char_emb: CharEmbedding,
                 cell: MultiLayerLSTMCell,
                 attn: GlobalAttention,
                 hidden: nn.Linear,
                 nc_residual: NormControlledResidual,
                 dropout: float = 0.0):
        super().__init__()
        self.char_emb = char_emb
        self.cell = cell
        self.attn = attn
        self.hidden = hidden
        self.nc_residual = nc_residual
        self.drop = nn.Dropout(dropout)

    @classmethod
    def from_params(cls,
                    dec_params: DecParams,
                    embedding: Optional[CharEmbedding] = None) -> LstmDecoder:
        emb_params = dec_params.emb_params
        lstm_params = dec_params.lstm_params
        if emb_params is None and embedding is None:
            raise ValueError(
                'Must specify either `emb_params` or `embedding`.')

        char_emb = get_embedding(
            emb_params) if embedding is None else embedding
        cell = MultiLayerLSTMCell.from_params(lstm_params)
        attn = GlobalAttention(dec_params.src_hidden_size,
                               dec_params.tgt_hidden_size)
        hidden = nn.Linear(
            dec_params.src_hidden_size + dec_params.tgt_hidden_size,
            dec_params.tgt_hidden_size)
        nc_residual = NormControlledResidual(
            norms_or_ratios=dec_params.norms_or_ratios,
            control_mode=dec_params.control_mode)

        return LstmDecoder(char_emb, cell, attn, hidden, nc_residual,
                           lstm_params.dropout)

    def forward(self,
                sot_id: int,
                src_emb: FT,
                src_outputs: FT,
                mask_src: BT,
                max_length: Optional[int] = None,
                target: Optional[LT] = None,
                lang_emb: Optional[FT] = None) -> Tuple[FT, FT]:
        # Prepare inputs.
        max_length = self._get_max_length(max_length, target)
        batch_size = mask_src.size('batch')
        input_ = self._prepare_first_input(sot_id, batch_size, mask_src.device)
        prev_att = get_zeros(batch_size,
                             g.hidden_size) if g.input_feeding else None
        state = LstmStatesByLayers.zero_state(self.cell.num_layers,
                                              batch_size,
                                              self.attn.input_tgt_size,
                                              bidirectional=False)

        # Main loop.
        log_probs = list()
        almt_distrs = list()
        with ScopedCache('Wh_s'):
            for l in range(max_length):
                state, log_prob, almt_distr, prev_att = self._forward_step(
                    input_,
                    src_emb,
                    state,
                    src_outputs,
                    mask_src,
                    lang_emb=lang_emb,
                    prev_att=prev_att)
                if target is None:
                    input_ = log_prob.max(dim=-1)[1].rename('batch')
                else:
                    input_ = target[l]

                log_probs.append(log_prob)
                almt_distrs.append(almt_distr)

        # Prepare outputs.
        with NoName(*log_probs), NoName(*almt_distrs):
            log_probs = torch.stack(log_probs).rename('pos', 'batch', 'unit')
            almt_distrs = torch.stack(almt_distrs).rename(
                'tgt_pos', 'batch', 'src_pos')
        return log_probs, almt_distrs

    def _get_max_length(self, max_length: Optional[int],
                        target: Optional[LT]) -> int:
        if self.training:
            assert target is not None
            assert target.names[1] == 'batch'
            assert len(target.shape) == 2
        if max_length is None:
            max_length = target.size("pos")
        return max_length

    def _prepare_first_input(self, sot_id: int, batch_size: int,
                             device: torch.device) -> FT:
        input_ = torch.full([batch_size], sot_id,
                            dtype=torch.long).rename('batch').to(device)
        return input_

    def _forward_step(self,
                      input_: LT,
                      src_emb: FT,
                      state: LstmStatesByLayers,
                      src_states: FT,
                      mask_src: BT,
                      lang_emb: Optional[FT] = None,
                      prev_att: Optional[FT] = None) -> Tuple[FT, FT, FT, FT]:
        emb = self.char_emb(input_)
        if lang_emb is not None:
            emb = emb + lang_emb
        inp = torch.cat([emb, prev_att], dim=-1) if g.input_feeding else emb
        hid_rnn, next_state = self.cell(
            inp, state)  # hid_rnn has gone through dropout already.
        almt, ctx = self.attn.forward(hid_rnn, src_states,
                                      mask_src)  # So has src_states.
        with NoName(hid_rnn, ctx):
            cat = torch.cat([hid_rnn, ctx], dim=-1)
        hid_cat = self.hidden(cat)
        hid_cat = self.drop(hid_cat)

        with NoName(src_emb, hid_cat, almt):
            ctx_emb = (src_emb * almt.t().unsqueeze(dim=-1)).sum(dim=0)
            hid_res = self.nc_residual(ctx_emb,
                                       hid_cat).rename('batch', 'hidden')

        logit = self.char_emb.project(hid_res)
        log_prob = logit.log_softmax(dim=-1).refine_names('batch', 'unit')

        return next_state, log_prob, almt, hid_res

    def is_finished(self, beam: Beam) -> BT:
        return beam.finished

    def get_next_candidates(self, beam: Beam) -> Candidates:
        nh = NameHelper()

        def collapse_beam(orig, is_lstm_state: bool = False):
            def wrapped(tensor):
                return nh.flatten(tensor, ['batch', 'beam'],
                                  'BB').rename(BB='batch')

            if is_lstm_state:
                return orig.apply(wrapped)
            return wrapped(orig)

        prev_att = collapse_beam(beam.prev_att) if g.input_feeding else None
        state, log_probs, almt, att = self._forward_step(
            collapse_beam(beam.tokens),
            beam.constants.src_emb,
            collapse_beam(beam.lstm_state, is_lstm_state=True),
            beam.constants.src_outputs,
            beam.constants.src_paddings,
            lang_emb=beam.constants.lang_emb,
            prev_att=prev_att)

        def unflatten(orig, is_lstm_state: bool = False):
            def wrapped(tensor):
                return nh.unflatten(tensor.rename(batch='BB'), 'BB',
                                    ['batch', 'beam'])

            if is_lstm_state:
                return orig.apply(wrapped)
            return wrapped(orig)

        log_probs = unflatten(log_probs)
        state = unflatten(state, is_lstm_state=True)
        almt = unflatten(almt)
        att = unflatten(att)
        return Candidates(log_probs, state, almt, att)

    def get_next_beam(self, beam: Beam, cand: Candidates) -> Beam:
        nh = NameHelper()

        # Get the new scores. For finished hypotheses, we should keep adding EOT.
        placeholder = torch.full_like(cand.log_probs, -9999.9)
        placeholder[..., EOT_ID] = 0.0
        new_scores = torch.where(beam.finished.align_as(placeholder),
                                 placeholder, cand.log_probs)
        accum = new_scores + beam.accum_scores.align_as(cand.log_probs)
        lp = nh.flatten(accum, ['beam', 'unit'], 'BU')
        top_s, top_i = torch.topk(lp, beam.beam_size, dim='BU')
        num_units = accum.size('unit')
        beam_i = top_i // num_units
        tokens = top_i % num_units

        batch_i = get_named_range(beam.batch_size, 'batch')
        batch_i = batch_i.align_as(top_i)

        def retrieve(tensor, last_name: str = 'hidden') -> torch.Tensor:
            with NoName(tensor, batch_i, beam_i):
                ret = tensor[batch_i, beam_i]
            new_names = ('batch', 'beam')
            if last_name:
                new_names += (last_name, )
            return ret.refine_names(*new_names)

        next_scores = top_s.rename(BU='beam')
        next_tokens = tokens.rename(BU='beam')
        next_beam_ids = beam_i.rename(BU='beam')
        next_state = cand.state.apply(retrieve)
        next_almt = retrieve(cand.almt, last_name='tgt_pos')
        next_att = retrieve(cand.att,
                            last_name='hidden') if g.input_feeding else None
        last_finished = retrieve(beam.finished, last_name=None)
        this_ended = next_tokens == EOT_ID
        reached_max = (beam.step + 1 == beam.constants.max_lengths)
        next_finished = last_finished | this_ended | reached_max
        next_beam = beam.follow(next_finished,
                                next_scores,
                                next_tokens,
                                next_state,
                                next_beam_ids,
                                next_almt,
                                prev_att=next_att)
        return next_beam

    def search(self,
               sot_id: int,
               src_emb: FT,
               src_outputs: FT,
               src_paddings: BT,
               src_lengths: LT,
               beam_size: int,
               lang_emb: Optional[FT] = None) -> Hypotheses:
        if beam_size <= 0:
            raise ValueError('`beam_size` must be positive.')

        batch_size = src_emb.size('batch')
        tokens = torch.full([batch_size, beam_size], sot_id,
                            dtype=torch.long).to(src_emb.device).rename(
                                'batch', 'beam')
        accum_scores = torch.full_like(tokens, -9999.9).float()
        accum_scores[:, 0] = 0.0
        init_att = None
        if g.input_feeding:
            init_att = get_zeros(batch_size, beam_size,
                                 g.hidden_size).rename('batch', 'beam',
                                                       'hidden')
        lstm_state = LstmStatesByLayers.zero_state(
            self.cell.num_layers,
            batch_size,
            beam_size,
            self.attn.input_tgt_size,
            bidirectional=False,
            names=['batch', 'beam', 'hidden'])

        def expand_beam(orig, collapse: bool = True):
            if collapse:
                return torch.repeat_interleave(orig, beam_size, dim='batch')
            else:
                return duplicate(orig, 'batch', beam_size, 'beam')

        src_emb = expand_beam(src_emb)
        src_outputs = expand_beam(src_outputs)
        src_paddings = expand_beam(src_paddings)
        max_lengths = (src_lengths.float() * 1.5).long()
        max_lengths = expand_beam(max_lengths, collapse=False)
        constants = BeamConstant(src_emb,
                                 src_outputs,
                                 src_paddings,
                                 max_lengths,
                                 lang_emb=lang_emb)
        init_beam = Beam(0,
                         accum_scores,
                         tokens,
                         lstm_state,
                         constants,
                         prev_att=init_att)
        hyps = super().search(init_beam)
        return hyps

    def get_hypotheses(self, final_beam: Beam) -> Hypotheses:
        btb = final_beam.trace_back('tokens', 'almt')
        tokens = btb['tokens']
        almt = btb['almt']
        return Hypotheses(tokens, almt, final_beam.accum_scores)
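
In `get_next_beam`, topk runs over the flattened beam x unit axis, so each selected index encodes both a parent beam and a token: integer division recovers the beam, the remainder recovers the token. A plain-tensor illustration:

import torch

num_units = 5
scores = torch.tensor([[0.1, 0.2, 0.9, 0.0, 0.3],    # beam 0
                       [0.4, 0.8, 0.05, 0.6, 0.2]])  # beam 1
flat = scores.reshape(-1)        # flatten beam x unit -> 10 candidates
top_s, top_i = flat.topk(3)      # best three: 0.9, 0.8, 0.6
print(top_i // num_units)        # tensor([0, 1, 1]): parent beam of each candidate
print(top_i % num_units)         # tensor([2, 1, 3]): token id of each candidate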
Example 13
class BaseModel(nn.Module):

    add_argument('char_emb_size',
                 default=256,
                 dtype=int,
                 msg='Embedding size for characters (as input).')
    add_argument('hidden_size',
                 default=256,
                 dtype=int,
                 msg='Hidden size for LSTM states.')
    add_argument('num_layers',
                 default=1,
                 dtype=int,
                 msg='Number of LSTM layers.')
    add_argument('dropout', default=0.2, dtype=float, msg='Dropout rate.')
    add_argument(
        'norms_or_ratios',
        default=(1.0, 0.2),
        nargs=2,
        dtype=float,
        msg='Norms or ratios of norms for the norm-controlled residual module.'
    )
    add_argument('control_mode',
                 default='relative',
                 dtype=str,
                 choices=['relative', 'absolute', 'none'],
                 msg='Control mode for the norm-controlled residual module.')
    add_argument('model_encoder_type',
                 dtype=str,
                 default='lstm',
                 choices=['lstm', 'cnn'],
                 msg='Which encoder to use.')
    add_argument(
        'kernel_sizes',
        dtype=int,
        nargs='+',
        default=(3, 5, 7),
        msg=
        'What kernel sizes to use for the CNN Encoder (can include repeats).')
    add_argument('beam_size', dtype=int, default=1, msg='Beam size.')
    add_argument(
        'separate_output',
        dtype=bool,
        default=False,
        msg='Flag to use a separate set of params for output embeddings.')

    def __init__(self,
                 num_src_chars: int,
                 num_tgt_chars: int,
                 phono_feat_mat: Optional[LT] = None,
                 special_ids: Optional[Sequence[int]] = None):

        super().__init__()

        def get_lstm_params(input_size: int,
                            bidirectional: bool) -> LstmParams:
            return LstmParams(input_size,
                              g.hidden_size,
                              g.num_layers,
                              g.dropout,
                              bidirectional=bidirectional)

        def get_emb_params_inner(num_chars: int):
            return get_emb_params(num_chars, phono_feat_mat, special_ids)

        enc_emb_params = get_emb_params_inner(num_src_chars)
        if g.model_encoder_type == 'lstm':
            enc_lstm_params = get_lstm_params(g.char_emb_size, True)
            self.encoder = LstmEncoder.from_params(enc_emb_params,
                                                   enc_lstm_params)
        else:
            cnn_params = CnnParams(g.hidden_size, g.kernel_sizes, g.dropout)
            self.encoder = CnnEncoder.from_params(enc_emb_params, cnn_params)

        if g.share_src_tgt_abc:
            dec_emb_params = None
            dec_embedding = self.encoder.embedding
        else:
            dec_emb_params = get_emb_params_inner(num_tgt_chars)
            dec_embedding = None
        # NOTE(j_luo) Input size is the sum of `g.char_emb_size` and `g.hidden_size` if input feeding is used.
        dec_input_size = g.char_emb_size + (g.hidden_size
                                            if g.input_feeding else 0)
        dec_lstm_params = get_lstm_params(dec_input_size, False)
        dec_params = DecParams(
            dec_lstm_params,
            g.hidden_size * 2,  # Bidirectional outputs.
            g.hidden_size,
            g.norms_or_ratios,
            g.control_mode,
            emb_params=dec_emb_params)
        self.decoder = LstmDecoder.from_params(dec_params,
                                               embedding=dec_embedding)

    def forward(self,
                batch: OnePairBatch,
                use_target: bool = True,
                max_length: int = None) -> Tuple[FT, FT]:
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids,
                                                batch.src_seqs.lengths)
        target = batch.tgt_seqs.ids if use_target else None
        lang_emb = self._prepare_lang_emb(batch)
        log_probs, almt_distrs = self.decoder(SOT_ID,
                                              src_emb,
                                              output,
                                              batch.src_seqs.paddings,
                                              max_length=max_length,
                                              target=target,
                                              lang_emb=lang_emb)
        return log_probs, almt_distrs

    def get_scores(self,
                   batch: OnePairBatch,
                   tgt_vocab_seqs: PaddedUnitSeqs,
                   chunk_size: int = 100) -> FT:
        """Given a batch and a list of target tokens (provided as id sequences), return scores produced by the model."""
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids,
                                                batch.src_seqs.lengths)
        src_emb = src_emb.refine_names('pos', 'batch', 'src_emb')
        output = output.refine_names('pos', 'batch', 'output')
        batch_size = src_emb.size('batch')
        lang_emb = self._prepare_lang_emb(batch)

        def create_chunk(size, base, old_chunk, interleave: bool = True):
            if not interleave:
                return base.repeat(1, batch_size)

            if old_chunk is not None and old_chunk.size(
                    'batch') == batch_size * size:
                return old_chunk

            new_chunk = torch.repeat_interleave(base, size, dim='batch')
            return new_chunk

        chunk_src_emb = None
        chunk_output = None
        chunk_src_paddings = None
        scores = list()
        for split in pbar(tgt_vocab_seqs.split(chunk_size),
                          desc='Get scores: chunk'):
            split: PaddedUnitSeqs
            bs_split = len(split)
            chunk_src_emb = create_chunk(bs_split, src_emb, chunk_src_emb)
            chunk_output = create_chunk(bs_split, output, chunk_output)
            chunk_src_paddings = create_chunk(bs_split,
                                              batch.src_seqs.paddings,
                                              chunk_src_paddings)
            chunk_target = create_chunk(None,
                                        split.ids,
                                        None,
                                        interleave=False)
            chunk_tgt_paddings = create_chunk(None,
                                              split.paddings,
                                              None,
                                              interleave=False)
            chunk_log_probs, _ = self.decoder(SOT_ID,
                                              chunk_src_emb,
                                              chunk_output,
                                              chunk_src_paddings,
                                              target=chunk_target,
                                              lang_emb=lang_emb)
            chunk_scores = chunk_log_probs.gather('unit', chunk_target)
            chunk_scores = (chunk_scores * chunk_tgt_paddings).sum('pos')
            with NoName(chunk_scores):
                scores.append(
                    chunk_scores.view(batch_size, bs_split).refine_names(
                        'batch', 'tgt_vocab'))
        scores = torch.cat(scores, dim='tgt_vocab')
        return scores

    def predict(self, batch: Union[SourceOnlyBatch,
                                   OnePairBatch]) -> Hypotheses:
        src_emb, (output, state) = self.encoder(batch.src_seqs.ids,
                                                batch.src_seqs.lengths)
        src_emb = src_emb.refine_names('pos', 'batch', 'src_emb')
        output = output.refine_names('pos', 'batch', 'output')

        lang_emb = self._prepare_lang_emb(batch)
        hyps = self.decoder.search(SOT_ID,
                                   src_emb,
                                   output,
                                   batch.src_seqs.paddings,
                                   batch.src_seqs.lengths,
                                   g.beam_size,
                                   lang_emb=lang_emb)
        return hyps

    @abstractmethod
    def _prepare_lang_emb(self, batch: OnePairBatch) -> FT:
        ...
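
`get_scores` pairs every batch item with every target in a vocabulary chunk by interleaving the source tensors (`repeat_interleave` gives [a, a, b, b]) while tiling the targets (`repeat` gives [x, y, x, y]); the `.view(batch_size, bs_split)` at the end undoes exactly this layout. In plain torch:

import torch

src = torch.tensor([1, 2])        # batch of 2 sources
tgt = torch.tensor([10, 20, 30])  # chunk of 3 targets
src_rep = torch.repeat_interleave(src, len(tgt))  # tensor([1, 1, 1, 2, 2, 2])
tgt_rep = tgt.repeat(len(src))                    # tensor([10, 20, 30, 10, 20, 30])
pairs = torch.stack([src_rep, tgt_rep], dim=1)    # all 6 (source, target) pairs
print(pairs.reshape(len(src), len(tgt), 2))       # the layout that .view(batch_size, bs_split) recovers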
Example 14
            if interpret_matching:
                gold_id, cand_ids = match
                gold_block = gold[gold_id]
                cand_rules = [cand[j] if j > -1 else None for j in cand_ids]
                cost = objective.GetCoefficient(v[name])
                print('---')
                print('gold block', gold_id, ':', gold_block)
                print('matched to rules:', cand_rules)
                print('with dist', str(cost))

    return matching, status, final_value, max_cost, size_cnt


if __name__ == "__main__":
    add_argument("match_proportion",
                 dtype=float,
                 default=.7,
                 msg="Proportion of gold blocks to force matches on")
    add_argument("k_matches",
                 dtype=int,
                 default=10,
                 msg="Number of matches to consider per gold block")
    add_argument("interpret_matching",
                 dtype=bool,
                 default=False,
                 msg="Flag to print out the rule matching")
    add_argument('cand_path',
                 dtype=str,
                 default='data/toy_cand_rules.txt',
                 msg='Path to the candidate rule file.')
    add_argument('out_path', dtype=str, msg='File to write the results to.')
    add_argument('max_power_set_size',
Example 15
class ExtractEvaluator(BaseEvaluator):

    add_argument(
        'matched_threshold',
        default=0.99,
        dtype=float,
        msg='Value of threshold to determine whether two words are matched.')

    def __init__(self, model: ExtractModel, dl: ContinuousTextDataLoader):
        self.model = model
        self.dl = dl
        self.analyzer = ExtractAnalyzer()

    def evaluate(self, stage: str) -> Metrics:
        segments = list()
        predictions = list()
        ground_truths = list()
        matched_segments = list()
        total_num_samples = 0
        analyzed_metrics = Metrics()
        for batch in pbar(self.dl, desc='eval_batch'):

            if g.eval_max_num_samples and total_num_samples + batch.batch_size > g.eval_max_num_samples:
                logging.imp(
                    f'Stopping at {total_num_samples} < {g.eval_max_num_samples} evaluated examples.'
                )
                break

            ret = self.model(batch)
            analyzed_metrics += self.analyzer.analyze(ret, batch)

            segments.extend(list(batch.segments))
            segmentations, _matched_segments = self._get_segmentations(
                ret, batch)
            predictions.extend(segmentations)
            matched_segments.extend(_matched_segments)
            ground_truths.extend(
                [segment.to_segmentation() for segment in batch.segments])
            total_num_samples += batch.batch_size

        df = _get_df(segments,
                     ground_truths,
                     predictions,
                     matched_segments,
                     columns=('segment', 'ground_truth', 'prediction',
                              'matched_segment'))
        out_path = g.log_dir / 'predictions' / f'extract.{stage}.tsv'
        out_path.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(out_path, index=None, sep='\t')
        matching_stats = get_matching_stats(predictions, ground_truths)
        prf_scores = get_prf_scores(matching_stats)
        return analyzed_metrics + matching_stats + prf_scores

    def _get_segmentations(
            self, model_ret: ExtractModelReturn, batch: ContinuousIpaBatch
    ) -> Tuple[List[Segmentation], np.ndarray]:
        # Get the best matched nll.
        start = model_ret.start
        end = model_ret.end
        bmv = model_ret.best_matched_vocab
        bmnll = -model_ret.best_matched_ll
        matched = bmnll < self.model.threshold

        start = start.cpu().numpy()
        end = end.cpu().numpy()
        bmv = bmv.cpu().numpy()
        bmw = self.model.vocab[bmv]  # Best matched word

        segmentations = list()
        matched_segments = list()
        for segment, s, e, m, w in zip(batch.segments, start, end, matched,
                                       bmw):
            spans = list()
            if len(segment) >= g.min_word_length and m:
                span = [segment[i] for i in range(s, e + 1)]
                span = Span('-'.join(span), s, e)
                spans.append(span)
                matched_segments.append(w)
            else:
                matched_segments.append('')
            segmentations.append(Segmentation(spans))
        return segmentations, matched_segments
Example 16
class MctsTrainer(RLTrainer):

    add_argument('num_mcts_sims',
                 default=100,
                 dtype=int,
                 msg='Number of MCTS simulations to run.')
    add_argument('expansion_batch_size',
                 default=10,
                 dtype=int,
                 msg='Batch size for expansion steps.')
    add_argument('mcts_batch_size',
                 default=128,
                 dtype=int,
                 msg='Batch size for optimizing the MCTS agent.')
    add_argument('replay_buffer_size',
                 default=1024,
                 dtype=int,
                 msg='Size for the replay buffer.')
    add_argument('num_episodes',
                 default=10,
                 dtype=int,
                 msg='Number of episodes.')
    add_argument('num_inner_steps',
                 default=10,
                 dtype=int,
                 msg='Number of optimization step per batch.')
    add_argument('episode_check_interval',
                 default=10,
                 dtype=int,
                 msg='Frequency of checking episodes.')
    add_argument('tolerance',
                 default=5,
                 dtype=int,
                 msg='Maximum number of epochs without improving the best score before early stopping.')
    add_argument('regress_lambda',
                 default=0.01,
                 dtype=float,
                 msg='Hyperparameter for regression loss.')
    add_argument('use_value_guidance',
                 default=True,
                 dtype=bool,
                 msg='Flag to use predicted values to guide the search.')
    add_argument('tau',
                 default=0.0,
                 dtype=float,
                 msg='Temperature for sampling episodes.')
    add_argument('improved_player_only',
                 default=False,
                 dtype=bool,
                 msg='Flag to use only improved player between epochs.')

    def __init__(self, *args, mcts: Mcts = None, **kwargs):
        if mcts is None:
            raise TypeError(
                'Must pass a trajectory collector to initialize this trainer.')

        if g.num_mcts_sims % g.expansion_batch_size > 0:
            raise ValueError(
                '`expansion_batch_size` should divide `num_mcts_sims`.')

        self.mcts = mcts
        super().__init__(*args, **kwargs)
        hparams = dict()
        for k, v in g.as_dict().items():
            if not isinstance(v, (int, float, bool, str)):
                v = str(v)
            hparams[k] = v
        self.metric_writer.add_hparams(hparams, dict())
        self.replay_buffer = ReplayBuffer()
        self.best_metrics = Metrics()
        self._old_state = dict()

    def add_trackables(self):
        super().add_trackables()
        self.tracker.add_count_trackable('tolerance', total=g.tolerance)
        step = self.tracker['step']
        episode = step.add_trackable('episode',
                                     total=g.num_episodes,
                                     endless=True)
        episode.add_trackable('rollout',
                              total=g.max_rollout_length,
                              endless=True)
        episode.add_trackable('mcts', total=g.num_mcts_sims, endless=True)
        step.add_trackable('inner_step', total=g.num_inner_steps, endless=True)

    def evaluate_at_start(self):
        metrics = self.evaluator.evaluate(self.stage, 0)
        self._update_best_score(metrics)
        self.metric_writer.add_metrics(self.best_metrics,
                                       self.tracker['step'].value)

    def _update_best_score(self, eval_metrics: Metrics) -> bool:
        best_score = -99999.9
        try:
            best_score = self.best_metrics.best_score.value
        except AttributeError:
            pass
        new_score = eval_metrics['eval/eval_reward'].value
        logging.info(f'Best score is {max(new_score, best_score):.3f}.')
        if new_score >= best_score:  # The same as the best score is tolerated.
            best_score = new_score
            self.best_metrics = Metrics(Metric('best_score', best_score, 1))
            best_path = g.log_dir / 'best_run'
            with best_path.open('w') as fout:
                fout.write(self.stage)
            return True
        return False

    def should_terminate(self, eval_metrics: Metrics) -> bool:
        if not self._update_best_score(eval_metrics):
            logging.imp('eval_reward has not improved.')
            self.tracker.update('tolerance')
            if g.improved_player_only:
                logging.imp('Loading old state dict.')
                self.agent.load_state_dict(self._old_state)
        else:
            self.tracker.reset('tolerance')
        self.metric_writer.add_metrics(self.best_metrics,
                                       self.tracker['step'].value)
        return self.tracker.is_finished('tolerance')

    def train_one_step(self, dl: OnePairDataLoader):
        if g.improved_player_only:
            self._old_state = self.agent.state_dict()
        # Collect episodes with the latest agent first.
        new_tr = self.mcts.collect_episodes(self.mcts.env.start, self.tracker)
        tr_rew = Metric('reward', sum(tr.rewards.sum() for tr in new_tr),
                        g.num_episodes)
        tr_len = Metric('trajectory_length', sum(map(len, new_tr)),
                        g.num_episodes)
        success = Metric('success', sum(tr.done for tr in new_tr),
                         g.num_episodes)
        metrics = Metrics(tr_rew, tr_len, success)

        # Add these new episodes to the replay buffer.
        for i, tr in enumerate(new_tr, 1):
            global_step = i + self.tracker['step'].value * g.num_episodes
            self.metric_writer.add_scalar('episode_reward',
                                          tr.rewards.sum(),
                                          global_step=global_step)
            self.metric_writer.add_text('trajectory',
                                        str(tr),
                                        global_step=global_step)
            # NOTE(j_luo) Use temperature if it's positive.
            if g.tau > 0.0:
                weight = math.exp(tr.total_reward * 10.0)
            else:
                weight = 1.0

            for tr_edge in tr:
                self.replay_buffer.append(tr_edge, weight)

        # Main loop.
        from torch.optim import SGD, Adam
        optim_cls = Adam if g.optim_cls == 'adam' else SGD
        optim_kwargs = dict()
        if optim_cls == SGD:
            optim_kwargs['momentum'] = 0.9
        self.set_optimizer(optim_cls,
                           lr=g.learning_rate,
                           weight_decay=g.weight_decay,
                           **optim_kwargs)
        with self.agent.policy_grad(True), self.agent.value_grad(True):
            for _ in range(g.num_inner_steps):
                # Get a batch of training trajectories from the replay buffer.
                edge_batch = self.replay_buffer.sample(g.mcts_batch_size)
                # edge_batch = np.random.choice(self.replay_buffer, size=g.mcts_batch_size)
                agent_inputs = AgentInputs.from_edges(edge_batch)

                self.agent.train()
                self.optimizer.zero_grad()

                policies = self.agent.get_policy(agent_inputs.id_seqs,
                                                 almts=(agent_inputs.almts1,
                                                        agent_inputs.almts2))
                # values = self.agent.get_values(agent_inputs.id_seqs, steps=agent_inputs.steps)
                with NoName(policies, agent_inputs.permissible_actions):
                    mask = agent_inputs.permissible_actions == SENTINEL_ID
                    pa = agent_inputs.permissible_actions
                    pa = torch.where(mask, torch.zeros_like(pa), pa)
                    logits = policies.gather(2, pa)
                    logits = torch.where(mask,
                                         torch.full_like(logits,
                                                         -9999.9), logits)
                    logits = logits.log_softmax(dim=-1)
                # r_max = agent_inputs.rewards.max()
                # r_min = agent_inputs.rewards.min()
                # weights = (agent_inputs.rewards - r_min) / (r_max - r_min + 1e-8)

                # weights = weights.align_as(pi_ce_losses)
                entropies = (-agent_inputs.mcts_pis *
                             (1e-8 + agent_inputs.mcts_pis).log()).sum(dim=-1)
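                # Subtracting the entropy of the MCTS policy turns the cross
                # entropy into KL(mcts_pi || policy), which is zero at a perfect match.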
                pi_ce_losses = (-agent_inputs.mcts_pis *
                                logits).sum(dim=-1) - entropies
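                # The 7 positions presumably correspond to the seven fields of a
                # SoundChangeAction (before, after, rtype, pre, d_pre, post, d_post).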
                for i in range(7):
                    metrics += Metric(f'entropy_{i}', entropies[:, i].sum(),
                                      g.mcts_batch_size)
                    metrics += Metric(f'pi_ce_loss_{i}',
                                      pi_ce_losses[:, i].sum(),
                                      g.mcts_batch_size)

                # v_regress_losses = 0.5 * (values - agent_inputs.qs) ** 2

                # pi_ce_loss = Metric('pi_ce_loss', (weights * pi_ce_losses).sum(), g.mcts_batch_size * 7)
                # mini_weights = get_tensor([1.0, 0.1, 1.0, 0.1, 0.1, 0.1, 0.1]).rename('mini').align_as(pi_ce_losses)
                # pi_ce_loss = Metric('pi_ce_loss', (mini_weights * pi_ce_losses).sum(), g.mcts_batch_size * 7)
                pi_ce_loss = Metric('pi_ce_loss', pi_ce_losses.sum(),
                                    g.mcts_batch_size * 7)
                # pi_ce_loss = Metric('pi_ce_loss', pi_ce_losses[:, 0].sum(), g.mcts_batch_size)
                # v_regress_loss = Metric('v_regress_loss', v_regress_losses.sum(), g.mcts_batch_size)
                total_loss = pi_ce_loss.total  # + g.regress_lambda * v_regress_loss.total
                total_loss = Metric('total_loss', total_loss,
                                    g.mcts_batch_size)

                total_loss.mean.backward()

                # Clip gradient norm.
                grad_norm = clip_grad(self.agent.parameters(),
                                      g.mcts_batch_size)
                # metrics += Metrics(total_loss, pi_ce_loss, v_regress_loss, grad_norm)
                metrics += Metrics(total_loss, pi_ce_loss, grad_norm)
                self.optimizer.step()
                self.tracker.update('inner_step')

        return metrics
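# The trainer above assumes a replay buffer with a weighted `append` and a
# `sample` method. A minimal sketch of such a buffer follows (the real class
# is not shown in this snippet, so its name and behavior here are assumptions):
import random


class WeightedReplayBuffer:

    def __init__(self):
        self._items = list()
        self._weights = list()

    def append(self, item, weight: float = 1.0):
        self._items.append(item)
        self._weights.append(weight)

    def sample(self, size: int) -> list:
        # Sample with replacement, proportionally to the stored weights.
        return random.choices(self._items, weights=self._weights, k=size)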
Example n. 17
0
from dataclasses import dataclass
from itertools import product
from typing import (ClassVar, Dict, Iterator, List, Optional, Sequence, Set,
                    Union)

import numpy as np
import torch

import sound_law.rl.trajectory as tr
from dev_misc import BT, add_argument, g, get_tensor, get_zeros
from dev_misc.utils import Singleton, pbar
from sound_law.data.alphabet import (ANY_ID, ANY_S_ID, ANY_UNS_ID, EMP, EMP_ID,
                                     EOT_ID, NULL_ID, SOT_ID, SYL_EOT_ID,
                                     Alphabet)

add_argument('factorize_actions',
             dtype=bool,
             default=False,
             msg='Flag to factorize the action space.')
add_argument('ngram_path', dtype='path', msg='Path to the ngram list.')


@dataclass(eq=True, frozen=True)
class SoundChangeAction:
    """One sound change rule."""

    before_id: int
    after_id: int
    rtype: str
    pre_id: int
    d_pre_id: int
    post_id: int
    d_post_id: int
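# Since the dataclass is declared with `eq=True, frozen=True`, actions are
# immutable and hashable, so they can be deduplicated in sets or used as
# dictionary keys. A small usage sketch (the ID values and the rtype string
# below are illustrative only, not taken from the real action space):
a1 = SoundChangeAction(before_id=3, after_id=7, rtype='basic',
                       pre_id=-1, d_pre_id=-1, post_id=-1, d_post_id=-1)
a2 = SoundChangeAction(before_id=3, after_id=7, rtype='basic',
                       pre_id=-1, d_pre_id=-1, post_id=-1, d_post_id=-1)
assert a1 == a2 and len({a1, a2}) == 1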
Example n. 18
0
import json
from multiprocessing import set_start_method
from typing import Optional

import numpy as np
import torch
from dev_misc import Initiator, add_argument, g, parse_args
from dev_misc.devlib.named_tensor import patch_named_tensors
from dev_misc.trainlib import set_random_seeds

from sound_law.config import a2c_reg, mcts_reg, reg, s2s_reg
from sound_law.train.manager import OnePairManager, OneToManyManager

add_argument('task',
             dtype=str,
             default='one_pair',
             choices=['one_pair', 'one_to_many'],
             msg='Which task to execute.')
add_argument('use_rl',
             dtype=bool,
             default=False,
             msg='Flag to use RL framework.')
add_argument('use_mcts', dtype=bool, default=False, msg='Flag to use MCTS.')
add_argument('agent',
             dtype=str,
             default='vpg',
             choices=['vpg', 'a2c'],
             msg='RL agent.')


def setup() -> Initiator:
Example n. 19
0
from sound_law.rl.env import SoundChangeEnv  # , TrajectoryCollector
from sound_law.rl.mcts import Mcts
# pylint: disable=no-name-in-module
from sound_law.rl.mcts_cpp import (  # pylint: disable=no-name-in-module
    PyActionSpaceOpt, PyEnv, PyEnvOpt, PyMctsOpt, PyWordSpaceOpt)
# pylint: enable=no-name-in-module
from sound_law.rl.trajectory import VocabState
from sound_law.s2s.module import CharEmbedding, EmbParams, PhonoEmbedding
from sound_law.s2s.one_pair import OnePairModel
from sound_law.s2s.one_to_many import OneToManyModel

from .trainer import MctsTrainer, Trainer

# from .trainer import MctsTrainer, PolicyGradientTrainer, Trainer

add_argument('batch_size', default=32, dtype=int, msg='Batch size.')
add_argument('check_interval',
             default=10,
             dtype=int,
             msg='Frequency to check the training progress.')
add_argument('eval_interval',
             default=100,
             dtype=int,
             msg='Frequency to call the evaluator.')
add_argument('save_interval',
             dtype=int,
             msg='Frequency to save the progress and the model.')
add_argument('learning_rate', default=2e-3, dtype=float, msg='Learning rate.')
add_argument('value_learning_rate',
             default=2e-3,
             dtype=float,
Example n. 20
0
class SoundChangeEnv(PyEnv):

    tnode_cls = VocabState

    add_argument('final_reward', default=1.0, dtype=float, msg='Final reward for reaching the end.')
    add_argument('step_penalty', default=0.02, dtype=float, msg='Penalty for each step if not the end state.')

    def __init__(self, *args, abc: Alphabet = None, **kwargs):
        self.abc = abc

        # Set the class variable for `SoundChangeAction` here.
        SoundChangeAction.abc = abc

        # Register unconditional actions first.
        units = [u for u in abc if u not in abc.special_units]

        def register_unconditional_action(u1: str, u2: str, cl: bool = False, gb: bool = False):
            id1 = abc[u1]
            id2 = abc[u2]
            if cl:
                self.register_cl_map(id1, id2)
            elif gb:
                if u1.startswith('i'):
                    self.register_gbj_map(id1, id2)
                else:
                    assert u1.startswith('u')
                    self.register_gbw_map(id1, id2)
            else:
                self.register_permissible_change(id1, id2)

        for u1, u2 in abc.edges:
            register_unconditional_action(u1, u2)
        for u in units:
            register_unconditional_action(u, EMP)
        for u1, u2 in abc.cl_map.items():
            register_unconditional_action(u1, u2, cl=True)
        for u1, u2 in abc.gb_map.items():
            register_unconditional_action(u1, u2, gb=True)

        # self.set_vowel_info(abc.vowel_mask, abc.vowel_base, abc.vowel_stress, abc.stressed_vowel, abc.unstressed_vowel)
        # self.set_glide_info(abc['j'], abc['w'])

    def __call__(self, state: VocabState, best_i: int, action: SoundChangeAction) -> Tuple[VocabState, bool, float]:
        return self.step(state, best_i, action)

    def show_path(self, state: VocabState) -> str:
        out = list()
        for action_id, reward in state.get_path():
            action = self.action_space.get_action(action_id)
            out.append(f'{action}, {reward:.3f}')
        return '(' + ', '.join(out) + ')'

    def apply_action(self, state: VocabState, action: SoundChangeAction) -> VocabState:
        return super().apply_action(state,
                                    action.before_id,
                                    action.after_id,
                                    action.rtype,
                                    action.pre_id,
                                    action.d_pre_id,
                                    action.post_id,
                                    action.d_post_id)

    def apply_block(self, state: VocabState, block: List[SoundChangeAction], strict: bool = False) -> VocabState:
        """Apply action to a block of actions sequentially. If `strict` is `False`, Only raise error if none of the rules apply."""
        curr_state = state
        applied = False
        for action in block:
            try:
                curr_state = self.apply_action(curr_state, action)
            except RuntimeError:
                if strict:
                    raise RuntimeError('In strict mode, some rule in the block does not apply here.')
            else:
                applied = True
        if not applied:
            raise RuntimeError('None of the rules in the block applies.')
        return curr_state

    def get_state_edit_dist(self, state1: VocabState, state2: VocabState) -> float:
        return super().get_state_edit_dist(state1, state2)

    def get_num_affected(self, state: VocabState, action: SoundChangeAction) -> int:
        return super().get_num_affected(state,
                                        action.before_id,
                                        action.after_id,
                                        action.rtype,
                                        action.pre_id,
                                        action.d_pre_id,
                                        action.post_id,
                                        action.d_post_id)
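# The strict/non-strict semantics of `apply_block`, shown in isolation with
# plain callables standing in for rules (a toy sketch, not part of the class):
def apply_block_toy(state, rules, strict=False):
    applied = False
    for rule in rules:
        try:
            state = rule(state)
        except RuntimeError:
            if strict:
                raise
        else:
            applied = True
    if not applied:
        raise RuntimeError('None of the rules in the block applies.')
    return state


def _inapplicable(state):
    raise RuntimeError('rule does not apply')


# With strict=False, inapplicable rules are simply skipped:
assert apply_block_toy(1, [_inapplicable, lambda s: s + 1]) == 2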
Example n. 21
0
class OneToManyManager:
    """The manager class for single-source-multiple-target scenarios."""

    add_argument('train_tgt_langs',
                 dtype=str,
                 nargs='+',
                 msg='Target languages used for training.')

    @staticmethod
    def prepare_raw_data() -> Tuple[List[str], CognateRegistry, Alphabet, Alphabet]:
        """Prepare raw data, including the cognates and the alphabets."""
        # Prepare cognate registry first.
        cr = CognateRegistry()
        all_tgt = sorted([g.tgt_lang] + list(g.train_tgt_langs))
        for tgt in all_tgt:
            cr.add_pair(g.data_path, g.src_lang, tgt)

        # Get alphabets. Note that the target alphabet is based on the union of all target languages, i.e., a shared alphabet for all.
        if g.share_src_tgt_abc:
            src_abc = cr.prepare_alphabet(*(all_tgt + [g.src_lang]))
            tgt_abc = src_abc
        else:
            src_abc = cr.prepare_alphabet(g.src_lang)
            tgt_abc = cr.prepare_alphabet(*all_tgt)

        return all_tgt, cr, src_abc, tgt_abc

    def __init__(self):
        all_tgt, self.cog_reg, self.src_abc, self.tgt_abc = self.prepare_raw_data()

        # Get stats for unseen units.
        stats = self.tgt_abc.stats
        _, test_tgt_path = get_paths(g.data_path, g.src_lang, g.tgt_lang)
        mask = (stats.sum() == stats.loc[test_tgt_path])
        unseen = mask[mask].index.tolist()
        total = len(stats.loc[test_tgt_path].dropna())
        logging.info(
            f'Unseen units ({len(unseen)}/{total}) for {g.tgt_lang} are: {unseen}.'
        )

        # Get language-to-id mappings. Used only for the targets (i.e., decoder side).
        self.lang2id = lang2id = {tgt: i for i, tgt in enumerate(all_tgt)}

        # Get all data loaders.
        self.dl_reg = DataLoaderRegistry()

        def create_setting(name: str,
                           tgt_lang: str,
                           split: Split,
                           for_training: bool,
                           keep_ratio: Optional[float] = None,
                           tgt_sot: bool = False) -> Setting:
            return Setting(name,
                           'one_pair',
                           split,
                           g.src_lang,
                           tgt_lang,
                           for_training,
                           keep_ratio=keep_ratio,
                           tgt_sot=tgt_sot)

        test_setting = create_setting(f'test@{g.tgt_lang}',
                                      g.tgt_lang,
                                      Split('all'),
                                      False,
                                      keep_ratio=g.test_keep_ratio)
        settings: List[Setting] = [test_setting]

        # Get the training languages.
        for train_tgt_lang in g.train_tgt_langs:
            if g.input_format == 'ielex':
                train_split = Split(
                    'train',
                    [1, 2, 3, 4])  # Use the first four folds for training.
                dev_split = Split('dev', [5])  # Use the last fold for dev.
            else:
                train_split = Split('train')
                dev_split = Split('dev')
            train_setting = create_setting(f'train@{train_tgt_lang}',
                                           train_tgt_lang,
                                           train_split,
                                           True,
                                           keep_ratio=g.keep_ratio)
            train_e_setting = create_setting(f'train@{train_tgt_lang}_e',
                                             train_tgt_lang,
                                             train_split,
                                             False,
                                             keep_ratio=g.keep_ratio)
            dev_setting = create_setting(f'dev@{train_tgt_lang}',
                                         train_tgt_lang, dev_split, False)
            test_setting = create_setting(f'test@{train_tgt_lang}',
                                          train_tgt_lang, Split('test'), False)

            settings.extend(
                [train_setting, train_e_setting, dev_setting, test_setting])
        for setting in settings:
            self.dl_reg.register_data_loader(setting,
                                             self.cog_reg,
                                             lang2id=lang2id)

        phono_feat_mat = special_ids = None
        if g.use_phono_features:
            phono_feat_mat = get_tensor(self.src_abc.pfm)
            special_ids = get_tensor(self.src_abc.special_ids)

        self.model = OneToManyModel(len(self.src_abc),
                                    len(self.tgt_abc),
                                    len(g.train_tgt_langs) + 1,
                                    lang2id[g.tgt_lang],
                                    lang2id=lang2id,
                                    phono_feat_mat=phono_feat_mat,
                                    special_ids=special_ids)

        if g.saved_model_path is not None:
            self.model.load_state_dict(
                torch.load(g.saved_model_path,
                           map_location=torch.device('cpu')))
            logging.imp(f'Loaded from {g.saved_model_path}.')
        if has_gpus():
            self.model.cuda()
        logging.info(self.model)

        metric_writer = MetricWriter(g.log_dir, flush_secs=5)

        # NOTE(j_luo) Evaluate on every loader that is not for training.
        eval_dls = self.dl_reg.get_loaders_by_name(
            lambda name: 'train' not in name or '_e' in name)
        self.evaluator = Evaluator(self.model,
                                   eval_dls,
                                   self.tgt_abc,
                                   metric_writer=metric_writer)

        if not g.evaluate_only:
            train_names = [
                f'train@{train_tgt_lang}'
                for train_tgt_lang in g.train_tgt_langs
            ]
            train_settings = [
                self.dl_reg.get_setting_by_name(name) for name in train_names
            ]
            self.trainer = Trainer(self.model,
                                   train_settings, [1.0] * len(train_settings),
                                   'step',
                                   stage_tnames=['step'],
                                   evaluator=self.evaluator,
                                   check_interval=g.check_interval,
                                   eval_interval=g.eval_interval,
                                   save_interval=g.save_interval,
                                   metric_writer=metric_writer)
            if g.saved_model_path is None:
                # self.trainer.init_params('uniform', -0.1, 0.1)
                self.trainer.init_params('xavier_uniform')
            optim_cls = Adam if g.optim_cls == 'adam' else SGD
            self.trainer.set_optimizer(optim_cls, lr=g.learning_rate)

    def run(self):
        if g.evaluate_only:
            # TODO(j_luo) load global_step from saved model.
            self.evaluator.evaluate('evaluate_only', 0)
        else:
            self.trainer.train(self.dl_reg)
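# The evaluator above is built from every loader whose name passes the filter
# `'train' not in name or '_e' in name`: dev and test loaders, plus the
# non-training copies of the training splits (the `_e` settings). Concretely
# (the language code is illustrative):
def keep(name: str) -> bool:
    return 'train' not in name or '_e' in name


assert keep('dev@got') and keep('test@got') and keep('train@got_e')
assert not keep('train@got')  # Plain training loaders are excluded.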
Example n. 22
0
class ExtractModel(nn.Module):

    add_argument('max_num_words',
                 default=3,
                 dtype=int,
                 msg='Max number of extracted words.')
    add_argument('max_word_length',
                 default=10,
                 dtype=int,
                 msg='Max length of extracted words.')
    add_argument(
        'init_threshold',
        default=0.05,
        dtype=float,
        msg=
        'Initial value of threshold to determine whether two words are matched.'
    )
    add_argument('use_adapt',
                 default=False,
                 dtype=bool,
                 msg='Flag to use adapter layer.')
    add_argument('init_ins_del_cost',
                 default=100,
                 dtype=float,
                 msg='Initial unit cost for insertions and deletions.')
    add_argument('min_ins_del_cost',
                 default=3.5,
                 dtype=float,
                 msg='Minimum unit cost for insertions and deletions.')
    add_argument('unextracted_prob',
                 default=0.01,
                 dtype=float,
                 msg='Probability assigned to each unextracted character.')
    add_argument('context_weight',
                 default=0.0,
                 dtype=float,
                 msg='Weight for the context probabilities.')
    add_argument('debug',
                 dtype=bool,
                 default=False,
                 msg='Flag to enter debug mode.')

    def __init__(self, lu_size: int):
        super().__init__()

        def _has_proper_length(segment):
            l = len(segment)
            return g.min_word_length <= l <= g.max_word_length

        with open(g.vocab_path, 'r', encoding='utf8') as fin:
            _vocab = set(line.strip() for line in fin)
            segments = [Segment(w) for w in _vocab]
            self.vocab = get_array([
                segment for segment in segments if _has_proper_length(segment)
            ])
            lengths = torch.LongTensor(list(map(len, self.vocab)))
            feat_matrix = [segment.feat_matrix for segment in self.vocab]
            feat_matrix = torch.nn.utils.rnn.pad_sequence(feat_matrix,
                                                          batch_first=True)
            max_len = lengths.max().item()
            source_padding = ~get_length_mask(lengths, max_len)
            self.register_buffer('vocab_feat_matrix', feat_matrix)
            self.register_buffer('vocab_source_padding', source_padding)
            self.register_buffer('vocab_length', lengths)
            self.vocab_feat_matrix.rename_('vocab', 'length', 'feat_group')
            self.vocab_source_padding.rename_('vocab', 'length')
            self.vocab_length.rename_('vocab')

            with Rename(self.vocab_feat_matrix, vocab='batch'):
                vocab_dense_feat_matrix = convert_to_dense(
                    self.vocab_feat_matrix)
            self.vocab_dense_feat_matrix = {
                k: v.rename(batch='vocab')
                for k, v in vocab_dense_feat_matrix.items()
            }

            # Get the entire set of units from vocab.
            units = set()
            for segment in self.vocab:
                units.update(segment.segment_list)
            self.id2unit = sorted(units)
            self.unit2id = {u: i for i, u in enumerate(self.id2unit)}
            # Now indexify the vocab. Gather feature matrices for units as well.
            indexed_segments = np.zeros([len(self.vocab), max_len],
                                        dtype='int64')
            unit_feat_matrix = dict()
            for i, segment in enumerate(self.vocab):
                indexed_segments[i, range(len(segment))] = [
                    self.unit2id[u] for u in segment.segment_list
                ]
                for j, u in enumerate(segment.segment_list):
                    if u not in unit_feat_matrix:
                        unit_feat_matrix[u] = segment.feat_matrix[j]
            unit_feat_matrix = [unit_feat_matrix[u] for u in self.id2unit]
            unit_feat_matrix = torch.nn.utils.rnn.pad_sequence(
                unit_feat_matrix, batch_first=True)
            self.register_buffer('unit_feat_matrix',
                                 unit_feat_matrix.unsqueeze(dim=1))
            self.register_buffer('indexed_segments',
                                 torch.from_numpy(indexed_segments))
            # Use dummy length to avoid the trouble later on.
            # HACK(j_luo) Have to provide 'length'.
            self.unit_feat_matrix.rename_('unit', 'length', 'feat_group')
            self.indexed_segments.rename_('vocab', 'length')
            with Rename(self.unit_feat_matrix, unit='batch'):
                unit_dense_feat_matrix = convert_to_dense(
                    self.unit_feat_matrix)
            self.unit_dense_feat_matrix = {
                k: v.rename(batch='unit')
                for k, v in unit_dense_feat_matrix.items()
            }

        self.adapter = AdaptLayer()

        if g.input_format == 'text':
            self.g2p = G2PLayer(lu_size, len(self.id2unit))

    _special_state_keys = [
        'vocab', 'vocab_dense_feat_matrix', 'unit2id', 'id2unit',
        'unit_dense_feat_matrix'
    ]

    def state_dict(self, **kwargs):
        state = super().state_dict(**kwargs)
        for key in self._special_state_keys:
            attr = drop_names(getattr(self, key))
            state[key] = attr
        return state

    def load_state_dict(self, state_dict: Dict, **kwargs):
        with WithholdKeys(state_dict, *self._special_state_keys):
            super().load_state_dict(state_dict, **kwargs)
        # HACK(j_luo) This isn't really terse.
        for key in self._special_state_keys:
            attr = getattr(self, key)
            setattr(self, key, state_dict[key])
            if torch.is_tensor(attr):
                names = attr.names
                getattr(self, key).rename_(*names)
            elif isinstance(attr, dict):
                for k, v in getattr(self, key).items():
                    if torch.is_tensor(v):
                        v.rename_(*attr[k].names)

    # IDEA(j_luo) The current api is worse than just declaring GlobalProperty(writeable=False) outright. And doesn't give proper type hints.
    @global_property
    def threshold(self):
        pass

    @global_property
    def ins_del_cost(self):
        pass

    @cached_property
    def effective_categories(self) -> List[Category]:
        ret = list()
        for cat in Category:
            if should_include(g.feat_groups, cat):
                ret.append(cat)
        return ret

    def forward(self, batch: ExtractBatch) -> ExtractModelReturn:
        """
        The generating story is:
            v
            |
            w
            |
            x -- ww -- theta

        Pr(x) = sum_w Pr(w) Pr(ww)
              = sum_w Pr(w) theta^|ww|
              = sum_{w, v} Pr(w | v) Pr(v) theta^|ww|

        Terminologies:
        matched_: the prefix after selecting v
        score: after multiplication with |w|
        best_: the prefix after selecting w
        """
        # Prepare representations.
        alignment = None
        char_log_probs = None  # Defined here so the non-text branch below cannot hit a NameError.
        if g.dense_input:
            # IDEA(j_luo) NoName shouldn't use reveal_name. Just keep the name in the context manager.
            with NoName(*self.unit_dense_feat_matrix.values()):
                unit_repr = torch.cat([
                    self.unit_dense_feat_matrix[cat]
                    for cat in self.effective_categories
                ],
                                      dim=-1)
            unit_repr = unit_repr.rename('batch', 'length',
                                         'char_emb').squeeze(dim='length')

            if g.input_format == 'text':
                ku_char_repr, word_repr = self.g2p(batch.unit_id_seqs,
                                                   unit_repr)
                char_log_probs = (ku_char_repr @ unit_repr.t()).log_softmax(
                    dim=-1)
                alignment = char_log_probs.exp()
            else:
                dfm = batch.dense_feat_matrix
                with Rename(*self.unit_dense_feat_matrix.values(),
                            unit='batch'):
                    adapted_dfm = self.adapter(dfm)
                with NoName(*adapted_dfm.values()):
                    word_repr = torch.cat([
                        adapted_dfm[cat] for cat in self.effective_categories
                    ],
                                          dim=-1)
                word_repr.rename_('batch', 'length', 'char_emb')
        else:
            with Rename(self.unit_feat_matrix, unit='batch'):
                word_repr = self.embedding(batch.feat_matrix,
                                           batch.source_padding)
                unit_repr = self.embedding(self.unit_feat_matrix)
            unit_repr = unit_repr.squeeze('length')
        unit_repr.rename_(batch='unit')

        # Main body: extract one span.
        extracted = Extracted(batch.batch_size)
        new_extracted = self._extract_one_span(batch, extracted, word_repr,
                                               unit_repr, char_log_probs)
        matches = new_extracted.matches
        len_e = matches.ll.size('len_e')
        vs = len(self.vocab)

        # Get the best score and span.
        # NOTE(j_luo) Some segments don't have any viable spans.
        flat_ll = matches.ll.flatten(['len_s', 'len_e', 'vocab'], 'cand')
        flat_viable = new_extracted.viable.expand_as(matches.ll).flatten(
            ['len_s', 'len_e', 'vocab'], 'cand')
        flat_viable_ll = (~flat_viable) * (-9999.9) + flat_ll
        # Add probs for unextracted characters.
        unextracted = batch.lengths.align_as(
            new_extracted.len_candidates) - new_extracted.len_candidates
        unextracted = unextracted.expand_as(matches.ll)
        flat_unextracted = unextracted.flatten(['len_s', 'len_e', 'vocab'],
                                               'cand')
        flat_unextracted_ll = flat_unextracted * math.log(g.unextracted_prob)
        flat_total_ll = flat_viable_ll + flat_unextracted_ll
        # Get the top candidates based on total scores.
        best_matched_ll, best_span_ind = flat_total_ll.max(dim='cand')
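        # `cand` flattens (len_s, len_e, vocab) as ((ls * len_e) + le) * vs + v,
        # so the three indices are recovered by integer division below.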
        start = best_span_ind // (len_e * vs)
        # NOTE(j_luo) Don't forget the length is off by g.min_word_length - 1.
        end = best_span_ind % (len_e * vs) // vs + start + g.min_word_length - 1
        best_matched_vocab = best_span_ind % vs

        if self.training:
            any_viable = new_extracted.viable.any('len_s').any('len_e')
            best_matched_ll = flat_total_ll.logsumexp(dim='cand')
            best_matched_ll = best_matched_ll * any_viable

        ret = ExtractModelReturn(start, end, best_matched_ll,
                                 best_matched_vocab, new_extracted, alignment)

        return ret

    def _extract_one_span(self, batch: ExtractBatch, extracted: Extracted,
                          word_repr: FT, unit_repr: FT,
                          char_log_probs: FT) -> Extracted:
        # Propose all span start/end positions.
        start_candidates = get_named_range(batch.max_length, 'len_s').align_to(
            'batch', 'len_s', 'len_e')
        # Range from `min_word_length` to `max_word_length`.
        len_candidates = get_named_range(
            g.max_word_length + 1 - g.min_word_length,
            'len_e') + g.min_word_length
        len_candidates = len_candidates.align_to('batch', 'len_s', 'len_e')
        # This is inclusive.
        end_candidates = start_candidates + len_candidates - 1

        # Only keep the viable/valid spans around.
        viable = (end_candidates < batch.lengths.align_as(end_candidates))
        start_candidates = start_candidates.expand_as(viable)
        len_candidates = len_candidates.expand_as(viable)
        # NOTE(j_luo) Use `viable` to get the lengths. `len_candidates` has dummy axes.
        # IDEA(j_luo) Any better way of handling this? Perhaps persistent names?
        len_s = viable.size('len_s')
        len_e = viable.size('len_e')
        bi = get_named_range(batch.batch_size, 'batch').expand_as(viable)
        with NoName(start_candidates, end_candidates, len_candidates, bi,
                    viable):
            viable_starts = start_candidates[viable].rename('viable')
            viable_lens = len_candidates[viable].rename('viable')
            viable_bi = bi[viable].rename('viable')

        # Get the word positions to get the corresponding representations.
        viable_starts = viable_starts.align_to('viable', 'len_w')
        word_pos_offsets = get_named_range(g.max_word_length,
                                           'len_w').align_as(viable_starts)
        word_pos = viable_starts + word_pos_offsets
        word_pos = word_pos.clamp(max=batch.max_length - 1)

        # Get the corresponding representations.
        nh = NameHelper()
        viable_bi = viable_bi.expand_as(word_pos)
        word_pos = nh.flatten(word_pos, ['viable', 'len_w'], 'viable_X_len_w')
        viable_bi = nh.flatten(viable_bi, ['viable', 'len_w'],
                               'viable_X_len_w')
        word_repr = word_repr.align_to('batch', 'length', 'char_emb')
        if g.input_format == 'text':
            with NoName(word_repr, viable_bi, word_pos, batch.unit_id_seqs):
                extracted_word_repr = word_repr[viable_bi, word_pos].rename(
                    'viable_X_len_w', 'char_emb')
                extracted_unit_ids = batch.unit_id_seqs[
                    viable_bi, word_pos].rename('viable_X_len_w')
        else:
            with NoName(word_repr, viable_bi, word_pos):
                extracted_word_repr = word_repr[viable_bi, word_pos].rename(
                    'viable_X_len_w', 'char_emb')
            extracted_unit_ids = None
        extracted_word_repr = nh.unflatten(extracted_word_repr,
                                           'viable_X_len_w',
                                           ['viable', 'len_w'])

        # Main body: Run DP to find the best matches.
        matches = self._get_matches(extracted_word_repr, unit_repr,
                                    viable_lens, extracted_unit_ids,
                                    char_log_probs)
        # Revert to the old shape (so that invalid spans are included).
        bi = get_named_range(batch.batch_size, 'batch').expand_as(viable)
        lsi = get_named_range(len_s, 'len_s').expand_as(viable)
        lei = get_named_range(len_e, 'len_e').expand_as(viable)
        vs = matches.ll.size('vocab')
        # IDEA(j_luo) NoName shouldn't make size() calls unavailable. Otherwise size() calls have to be moved outside the context. Also the names should be preserved as well.
        with NoName(bi, lsi, lei, viable, matches.ll):
            v_bi = bi[viable]
            v_lsi = lsi[viable]
            v_lei = lei[viable]
            all_ll = get_zeros(batch.batch_size, len_s, len_e, vs)
            all_ll = all_ll.float().fill_(-9999.9)
            all_ll[v_bi, v_lsi, v_lei] = matches.ll
            matches.ll = all_ll.rename('batch', 'len_s', 'len_e', 'vocab')

        new_extracted = Extracted(batch.batch_size, matches, viable,
                                  len_candidates)
        return new_extracted

    def _get_matches(self, extracted_word_repr: FT, unit_repr: FT,
                     viable_lens: LT, extracted_unit_ids: LT,
                     char_log_probs: FT) -> Matches:
        ns = extracted_word_repr.size('viable')
        len_w = extracted_word_repr.size('len_w')
        nt = len(self.vocab_feat_matrix)
        msl = extracted_word_repr.size('len_w')
        mtl = self.vocab_feat_matrix.size('length')

        # Compute matching logits all at once: for each viable span position, take the dot product with every unit representation.
        ctx_logits = extracted_word_repr @ unit_repr.t()
        ctx_log_probs = ctx_logits.log_softmax(dim='unit').flatten(
            ['viable', 'len_w'], 'viable_X_len_w')
        with NoName(char_log_probs, extracted_unit_ids):
            global_log_probs = char_log_probs[extracted_unit_ids].rename(
                'viable_X_len_w', 'unit')
        weighted_log_probs = g.context_weight * ctx_log_probs + (
            1.0 - g.context_weight) * global_log_probs
        costs = -weighted_log_probs

        # Name: viable x len_w x unit
        costs = costs.unflatten('viable_X_len_w', [('viable', ns),
                                                   ('len_w', len_w)])

        # NOTE(j_luo) Use dictionary to save every state.
        fs = dict()
        for i in range(msl + 1):
            fs[(i, 0)] = get_zeros(ns, nt).fill_(i * self.ins_del_cost)
        for j in range(mtl + 1):
            fs[(0, j)] = get_zeros(ns, nt).fill_(j * self.ins_del_cost)

        # ------------------------ Main body: DP ----------------------- #

        # Transition.
        with NoName(self.indexed_segments, costs):
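            # Banded transitions: for each source length ls, only target
            # lengths in [ls - 2, ls + 1] are filled in.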
            for ls in range(1, msl + 1):
                min_lt = max(ls - 2, 1)
                max_lt = min(ls + 2, mtl + 1)
                for lt in range(min_lt, max_lt):
                    transitions = list()
                    if (ls - 1, lt) in fs:
                        transitions.append(fs[(ls - 1, lt)] +
                                           self.ins_del_cost)
                    if (ls, lt - 1) in fs:
                        transitions.append(fs[(ls, lt - 1)] +
                                           self.ins_del_cost)
                    if (ls - 1, lt - 1) in fs:
                        vocab_inds = self.indexed_segments[:, lt - 1]
                        sub_cost = costs[:, ls - 1, vocab_inds]
                        transitions.append(fs[(ls - 1, lt - 1)] + sub_cost)
                    if transitions:
                        all_s = torch.stack(transitions, dim=-1)
                        new_s, _ = all_s.min(dim=-1)
                        fs[(ls, lt)] = new_s

        f_lst = list()
        for i in range(msl + 1):
            for j in range(mtl + 1):
                if (i, j) not in fs:
                    fs[(i, j)] = get_zeros(ns, nt).fill_(9999.9)
                f_lst.append(fs[(i, j)])
        f = torch.stack(f_lst, dim=0).view(msl + 1, mtl + 1, -1,
                                           len(self.vocab))
        f.rename_('len_w_src', 'len_w_tgt', 'viable', 'vocab')

        # Get the values wanted.
        with NoName(f, viable_lens, self.vocab_length):
            idx_src = viable_lens.unsqueeze(dim=-1)
            idx_tgt = self.vocab_length
            viable_i = get_range(ns, 2, 0)
            vocab_i = get_range(len(self.vocab_length), 2, 1)
            nll = f[idx_src, idx_tgt, viable_i, vocab_i]
            nll.rename_('viable', 'vocab')

        # Get the best spans.
        matches = Matches(-nll, f)
        return matches
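# The DP in `_get_matches` is a banded weighted edit distance, vectorized over
# all viable spans and all vocabulary words at once. A scalar sketch of the
# same recurrence, with uniform costs standing in for the learned
# -log-probabilities and a symmetric band (assumptions, not the exact band above):
def edit_dp(src, tgt, ins_del: float = 1.0, band: int = 2) -> float:
    fs = {(0, 0): 0.0}
    for i in range(1, len(src) + 1):
        fs[(i, 0)] = i * ins_del
    for j in range(1, len(tgt) + 1):
        fs[(0, j)] = j * ins_del
    for i in range(1, len(src) + 1):
        for j in range(max(i - band, 1), min(i + band, len(tgt)) + 1):
            sub_cost = 0.0 if src[i - 1] == tgt[j - 1] else 1.0
            candidates = [fs[(i - 1, j - 1)] + sub_cost]
            if (i - 1, j) in fs:
                candidates.append(fs[(i - 1, j)] + ins_del)
            if (i, j - 1) in fs:
                candidates.append(fs[(i, j - 1)] + ins_del)
            fs[(i, j)] = min(candidates)
    # Cells outside the band are never filled, hence the infinite fallback.
    return fs.get((len(src), len(tgt)), float('inf'))


assert edit_dp('abc', 'abd') == 1.0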
Example n. 23
0
def simulate(
    raw_inputs: Optional[List[Tuple[List[str], List[str], List[str]]]] = None
) -> Tuple[OnePairManager, List[SoundChangeAction], List[PlainState],
           List[str]]:
    add_argument("in_path",
                 dtype=str,
                 msg="Input path to the saved path file.")
    # Get alphabet and action space.
    initiator = setup()
    initiator.run()
    manager = OnePairManager()

    with open(g.segments_dump_path, 'rb') as fin:
        dump = pickle.load(fin)
    _fp.load_repository(dump['proto_ph_map'].keys())

    # Get the list of rules.
    gold = list()
    if raw_inputs is not None:
        for ri in raw_inputs:
            gold.extend(get_actions(*ri))
    elif g.in_path:
        if str(g.in_path).endswith('tsv'):
            lines = pd.read_csv(g.in_path,
                                sep='\t')['action'].values  # type: ignore
        else:
            with open(g.in_path, 'r', encoding='utf8') as fin:
                lines = [line.strip() for line in fin.readlines()]
        gold = get_actions(lines)
    else:
        df = pd.read_csv('data/test_annotations.csv')
        df = df.dropna(subset=['ref no.'])
        for ref in ref_no[g.tgt_lang]:
            rows = df[df['ref no.'].str.startswith(ref)]
            gold.extend(
                get_actions(rows['rule'],
                            orders=rows['order'],
                            refs=rows['ref no.']))

    # Simulate the actions and get the distance.
    PlainState.env = manager.env
    PlainState.end_state = PlainState(manager.env.end)
    PlainState.abc = manager.tgt_abc
    state = PlainState(manager.env.start)
    states = [state]
    actions = list()
    refs = list()
    expanded_gold = list()

    # def test(s1, s2):
    #     seq1 = [PlainState.abc[u] for u in s1.split()]
    #     seq2 = [PlainState.abc[u] for u in s2.split()]
    #     return PlainState.env.get_edit_dist(seq1, seq2)

    logging.info(f"Starting dist: {state.dist:.3f}")
    for hr in gold:
        if hr.expandable:
            action_q = hr.specialize(state)
            logging.warning(f"This is an expandable rule: {hr}")
        else:
            action_q = [hr.to_action()]
        for action in action_q:
            logging.info(f"Applying {action}")
            state = state.apply_action(action)
            states.append(state)
            actions.append(action)
            refs.append(hr.ref)
            expanded_gold.append(action)
            logging.info(f"New dist: {state.dist:.3f}")

    # NOTE(j_luo) We can only score based on expanded rules.
    gold = expanded_gold
    return manager, gold, states, refs
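# A typical invocation, assuming the usual CLI configuration has already been
# supplied (a usage sketch; not part of the original script):
# manager, gold, states, refs = simulate()
# print(f'Final dist: {states[-1].dist:.3f}')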