Example #1
    def create_config(cls, crp, eval_recordings, eval_lm, extra_config,
                      extra_post_config):
        config = rasr.RasrConfig()
        post_config = rasr.RasrConfig()

        config._update(crp.log_config)
        post_config._update(crp.log_post_config)

        config.costa.statistics.corpus = crp.corpus_config
        post_config.costa.statistics.corpus = crp.corpus_post_config

        config.costa.statistics.evaluate_recordings = eval_recordings
        if eval_recordings:
            config.costa.statistics.feature_extraction.file = "audio.flow"

        config.costa.lexical_statistics = crp.lexicon_config is not None
        config.costa.statistics.lexicon = crp.lexicon_config
        post_config.costa.statistics.lexicon = crp.lexicon_post_config

        config.costa.lm_statistics = (crp.language_model_config is not None
                                      and crp.lexicon_config is not None
                                      and eval_lm)
        config.costa.statistics.lm = crp.language_model_config
        post_config.costa.statistics.lm = crp.language_model_post_config

        if extra_config is not None:
            config._update(extra_config)

        if extra_post_config is not None:
            post_config._update(extra_post_config)

        return config, post_config
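For orientation, a hedged sketch of how this method might be called; the enclosing job class (written as CostaJob here) and the CRP setup are assumptions inferred from the surrounding examples.

import i6_core.rasr as rasr

crp = rasr.CommonRasrParameters()
rasr.crp_add_default_output(crp)  # as in Example #2
# corpus, lexicon and LM configs would be attached to crp here

config, post_config = CostaJob.create_config(  # class name is an assumption
    crp=crp,
    eval_recordings=False,  # skip audio statistics, so no "audio.flow" needed
    eval_lm=True,           # LM statistics require lexicon + LM configs on crp
    extra_config=None,
    extra_post_config=None,
)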
Example #2
    def build_crp(
        self,
        am_args,
        corpus_object,
        concurrent,
        segment_path,
        lexicon_args,
        cart_tree_path=None,
        allophone_file=None,
        lm_args=None,
    ):
        """
        constructs a CommonRasrParameters from the given settings and files and stores it as self.crp
        """
        crp = rasr.CommonRasrParameters()
        rasr.crp_add_default_output(crp)
        crp.acoustic_model_config = am.acoustic_model_config(**am_args)
        rasr.crp_set_corpus(crp, corpus_object)
        crp.concurrent = concurrent
        crp.segment_path = segment_path

        crp.lexicon_config = rasr.RasrConfig()
        crp.lexicon_config.file = lexicon_args["filename"]
        crp.lexicon_config.normalize_pronunciation = lexicon_args[
            "normalize_pronunciation"]

        if "add_from_lexicon" in lexicon_args:
            crp.acoustic_model_config.allophones.add_from_lexicon = lexicon_args[
                "add_from_lexicon"]
        if "add_all" in lexicon_args:
            crp.acoustic_model_config.allophones.add_all = lexicon_args[
                "add_all"]

        if cart_tree_path is not None:
            crp.acoustic_model_config.state_tying.type = "cart"
            crp.acoustic_model_config.state_tying.file = cart_tree_path

        if lm_args is not None:
            crp.language_model_config = rasr.RasrConfig()
            crp.language_model_config.type = lm_args["type"]
            crp.language_model_config.file = lm_args["filename"]
            crp.language_model_config.scale = lm_args["scale"]

        if allophone_file is not None:
            crp.acoustic_model_config.allophones.add_from_file = allophone_file

        self.crp = crp
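A usage sketch with illustrative argument dicts, called from within the system class; the keys mirror exactly what build_crp reads above, while the path objects (lexicon_path, lm_path, segment_path, corpus_object) are assumed to exist.

am_args = {"states_per_phone": 3, "tdp_scale": 1.0}  # forwarded to acoustic_model_config (Example #5)
lexicon_args = {
    "filename": lexicon_path,  # tk.Path to a bliss lexicon, assumed
    "normalize_pronunciation": False,
    "add_from_lexicon": True,  # optional allophone flags
    "add_all": False,
}
lm_args = {"type": "ARPA", "filename": lm_path, "scale": 10.0}

self.build_crp(
    am_args=am_args,
    corpus_object=corpus_object,  # assumed CorpusObject with audio/corpus info
    concurrent=50,
    segment_path=segment_path,
    lexicon_args=lexicon_args,
    lm_args=lm_args,
)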
Example #3
    def lattice_combined_recognition(self):
        for epoch in self.epochs:
            lattice_bundles = []
            epoch_name = "{}_epoch.{}".format(self.name, epoch)
            print("{}: Adding recognition for epoch {}".format(
                self.name, epoch_name))
            for key, segment in self.single_segments.items():
                eval_corpus = copy.deepcopy(self.corpus)
                eval_corpus.out_segment_path = segment
                eval_corpus.concurrent = 1

                bundled_flow = self.recog_args["feature_flow"]
                bundled_flow.flags["cache_mode"] = "bundle"
                scorer_name = "{}_{}".format(epoch_name, key)

                returnn_scorer = rasr.ReturnnScorer(
                    feature_dimension=self.scorer_args["feature_dimension"],
                    output_dimension=self.scorer_args["output_dimension"],
                    prior_mixtures=self.scorer_args["prior_mixtures"],
                    model=self.models[key][epoch],
                    prior_scale=self.scorer_args["prior_scale"],
                    prior_file=None,
                )

                eval_corpus.language_model_config.scale = self.recog_args[
                    "lm_scale"]
                model_combination_config = rasr.RasrConfig()
                model_combination_config.pronunciation_scale = self.recog_args[
                    "pronunciation_scale"]

                rec = recog.AdvancedTreeSearchJob(
                    crp=eval_corpus,
                    feature_flow=bundled_flow,
                    feature_scorer=returnn_scorer,
                    model_combination_config=model_combination_config,
                )
                rec.keep_value(self.recognition_keep_value)
                rec.set_vis_name("Recog %s" % scorer_name)
                self.jobs["recog_%s" % scorer_name] = rec
                lattice_bundles.append(rec.out_lattice_bundle)
            m = MergeFilesJob(lattice_bundles)

            self.jobs["lat2ctm_%s" %
                      epoch_name] = lat2ctm = recog.LatticeToCtmJob(
                          crp=self.corpus,
                          lattice_cache=m.out_file,
                          parallelize=False)
            self.ctm_files["recog_%s" % epoch_name] = lat2ctm.out_ctm_file

            kwargs = copy.deepcopy(self.wer_scorer_args)
            kwargs["hyp"] = lat2ctm.out_ctm_file
            scorer = self.wer_scorer(**kwargs)

            self.jobs["scorer_%s" % epoch_name] = scorer
            self.scorers[epoch_name] = scorer
            tk.register_output("recog_%s.reports" % epoch_name,
                               scorer.report_dir)
            self.add_input(scorer.report_dir)
Example #4
    def create_config(cls, crp, feature_flow, alignment_options, word_boundaries,
                      label_scorer, align_node_options,
                      extra_config, extra_post_config, **kwargs):
        """
        :param recipe.rasr.csp.CommonSprintParameters csp:
        :param feature_flow:
        :param rasr.FeatureScorer feature_scorer:
        :param dict[str] alignment_options:
        :param bool word_boundaries:
        :param recipe.rasr.LabelScorer label_scorer:
        :param dict[str] align_node_options:
        :param extra_config:
        :param extra_post_config:
        :return: config, post_config
        :rtype: (rasr.SprintConfig, rasr.SprintConfig)
        """

        alignment_flow = cls.create_flow(feature_flow)
        align_node = 'speech-label-alignment'
        assert label_scorer is not None, 'need label scorer for label aligner'

        # acoustic model + lexicon for the flow nodes
        mapping = {
            'corpus': 'acoustic-model-trainer.corpus',
            'lexicon': [],
            'acoustic_model': [],
        }
        for node in alignment_flow.get_node_names_by_filter(align_node):
            mapping['lexicon'].append(
                'acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.lexicon'
                % node)
            mapping['acoustic_model'].append(
                'acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.acoustic-model'
                % node)

        config, post_config = rasr.build_config_from_mapping(crp, mapping, parallelize=True)

        # alignment options for the flow nodes
        alignopt = {}
        if alignment_options is not None:
            alignopt.update(alignment_options)
        for node in alignment_flow.get_node_names_by_filter(align_node):
            node_config = config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction[node]
            # alignment node option
            for k, v in align_node_options.items():
                node_config[k] = v
            # aligner search options
            node_config.aligner = rasr.RasrConfig()
            for k, v in alignopt.items():
                node_config.aligner[k] = v
            # scorer
            label_scorer.apply_config('label-scorer', node_config, node_config)

        alignment_flow.apply_config('acoustic-model-trainer.aligning-feature-extractor.feature-extraction', config, post_config)

        config.action = 'dry'
        config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction.file = 'alignment.flow'
        post_config['*'].allow_overwrite = True

        config._update(extra_config)
        post_config._update(extra_post_config)

        return config, post_config
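For orientation, a hedged sketch of the two option dicts this method consumes; the alignment_options values are borrowed from the label-aligner setup in Example #11, and the empty align_node_options is an assumption for the simplest case.

# Illustrative inputs; alignment_options mirrors Example #11's label aligner.
alignment_options = {       # copied into node_config.aligner
    "label-pruning": 10,
    "label-pruning-limit": 10000,
}
align_node_options = {}     # extra per-node settings, empty in the simplest case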
Example #5
def acoustic_model_config(
    state_tying="monophone",
    states_per_phone=3,
    state_repetitions=1,
    across_word_model=True,
    early_recombination=False,
    tdp_scale=1.0,
    tdp_transition=(3.0, 0.0, 3.0, 2.0),
    tdp_silence=(0.0, 3.0, "infinity", 6.0),
    tying_type="global",
    nonword_phones="",
    tdp_nonword=(0.0, 3.0, "infinity", 6.0),
):
    config = rasr.RasrConfig()

    config.state_tying.type = state_tying
    config.allophones.add_from_lexicon = True
    config.allophones.add_all = False

    config.hmm.states_per_phone = states_per_phone
    config.hmm.state_repetitions = state_repetitions
    config.hmm.across_word_model = across_word_model
    config.hmm.early_recombination = early_recombination

    config.tdp.scale = tdp_scale

    config.tdp["*"].loop = tdp_transition[0]
    config.tdp["*"].forward = tdp_transition[1]
    config.tdp["*"].skip = tdp_transition[2]
    config.tdp["*"].exit = tdp_transition[3]

    config.tdp.silence.loop = tdp_silence[0]
    config.tdp.silence.forward = tdp_silence[1]
    config.tdp.silence.skip = tdp_silence[2]
    config.tdp.silence.exit = tdp_silence[3]

    config.tdp["entry-m1"].loop = "infinity"
    config.tdp["entry-m2"].loop = "infinity"

    if tying_type == "global-and-nonword":
        config.tdp.tying_type = "global-and-nonword"
        config.tdp.nonword_phones = nonword_phones
        for nw in [0, 1]:
            k = "nonword-%d" % nw
            config.tdp[k].loop = tdp_nonword[0]
            config.tdp[k].forward = tdp_nonword[1]
            config.tdp[k].skip = tdp_nonword[2]
            config.tdp[k].exit = tdp_nonword[3]

    return config
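A short usage sketch; as the assignments above imply, each TDP tuple is ordered (loop, forward, skip, exit).

# Monophone AM with the default transition penalties; values illustrative.
am_config = acoustic_model_config(
    state_tying="monophone",
    states_per_phone=3,
    tdp_transition=(3.0, 0.0, 3.0, 2.0),      # (loop, forward, skip, exit)
    tdp_silence=(0.0, 3.0, "infinity", 6.0),  # "infinity" disables the skip
)
crp.acoustic_model_config = am_config  # attached as in Example #2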
Example #6
    def _init_lexicon(self, corpus_key: str, filename: Path,
                      normalize_pronunciation: bool, **kwargs):
        """
        TODO: docstring

        :param corpus_key:
        :param filename:
        :param normalize_pronunciation:
        :param kwargs:
        :return:
        """
        self.crp[corpus_key].lexicon_config = rasr.RasrConfig()
        self.crp[corpus_key].lexicon_config.file = filename
        self.crp[
            corpus_key].lexicon_config.normalize_pronunciation = normalize_pronunciation
Example #7
    def _init_lm(self, corpus_key: str, filename: Path, type: str, scale: float,
                 **kwargs):
        """
        Initializes the language model config of the given corpus.

        :param corpus_key: corpus identifier of the crp to modify
        :param filename: path to the LM file, e.g. an ARPA LM
        :param type: RASR LM type, e.g. "ARPA"
        :param scale: LM scale
        :param kwargs:
        :return:
        """
        self.crp[corpus_key].language_model_config = rasr.RasrConfig()
        self.crp[corpus_key].language_model_config.type = type
        self.crp[corpus_key].language_model_config.file = filename
        self.crp[corpus_key].language_model_config.scale = scale
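A hedged call sketch from within the system class; the argument values mirror the lm_args keys consumed in Example #2, and lm_path is assumed to be an available tk.Path.

self._init_lm(
    corpus_key="dev",
    filename=lm_path,  # e.g. an ARPA LM file, assumed available
    type="ARPA",
    scale=10.0,
)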
Example #8
    def get_tf_flow(
        checkpoint_path: Union[Path, returnn.Checkpoint],
        tf_graph_path: Path,
        returnn_op_path: Path,
        forward_output_layer: str = "output",
        tf_fwd_input_name: str = "tf-fwd-input",
    ):
        """
        Create flow network and config for the tf-fwd node

        :param Union[Path, returnn.Checkpoint] checkpoint_path: RETURNN model checkpoint which should be loaded
        :param Path tf_graph_path: compiled tf graph for the model
        :param Path returnn_op_path: path to native lstm library
        :param str forward_output_layer: name of layer whose output is used
        :param str tf_fwd_input_name: tf flow node input name. see: add_tf_flow_base_flow()
        :rtype: FlowNetwork
        """
        input_name = tf_fwd_input_name

        tf_flow = rasr.FlowNetwork()
        tf_flow.add_input(input_name)
        tf_flow.add_output("features")
        tf_flow.add_param("id")
        tf_fwd = tf_flow.add_node("tensorflow-forward", "tf-fwd",
                                  {"id": "$(id)"})
        tf_flow.link(f"network:{input_name}", tf_fwd + ":input")
        tf_flow.link(tf_fwd + ":log-posteriors", "network:features")

        tf_flow.config = rasr.RasrConfig()

        tf_flow.config[tf_fwd].input_map.info_0.param_name = "input"
        tf_flow.config[
            tf_fwd].input_map.info_0.tensor_name = "extern_data/placeholders/data/data"
        tf_flow.config[tf_fwd].input_map.info_0.seq_length_tensor_name = (
            "extern_data/placeholders/data/data_dim0_size")

        tf_flow.config[tf_fwd].output_map.info_0.param_name = "log-posteriors"
        tf_flow.config[
            tf_fwd].output_map.info_0.tensor_name = f"{forward_output_layer}/output_batch_major"

        tf_flow.config[tf_fwd].loader.type = "meta"
        tf_flow.config[tf_fwd].loader.meta_graph_file = tf_graph_path
        tf_flow.config[tf_fwd].loader.saved_model_file = checkpoint_path

        tf_flow.config[tf_fwd].loader.required_libraries = returnn_op_path

        return tf_flow
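A hedged usage sketch; the graph and native-op artifacts would typically come from RETURNN compile jobs, whose names here are assumptions.

tf_flow = get_tf_flow(
    checkpoint_path=checkpoint,                 # returnn.Checkpoint of the model
    tf_graph_path=compile_graph_job.out_graph,  # assumed CompileTFGraphJob output
    returnn_op_path=native_op_job.out_op,       # assumed CompileNativeOpJob output
    forward_output_layer="output",
)
# the returned network is then combined with a base feature flow,
# cf. add_tf_flow_base_flow() mentioned in the docstring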
Example #9
    def store_allophones(self, source_corpus, target_corpus="base", **kwargs):
        """
        dump allophones into a file
        :param str source_corpus:
        :param str target_corpus:
        :param kwargs:
        :return:
        """
        self.jobs[target_corpus]["allophones"] = lexicon.StoreAllophonesJob(
            self.crp[source_corpus], **kwargs)
        # noinspection PyUnresolvedReferences
        self.allophone_files[target_corpus] = (
            self.jobs[target_corpus]["allophones"].out_allophone_file)
        if self.crp[target_corpus].acoustic_model_post_config is None:
            self.crp[target_corpus].acoustic_model_post_config = rasr.RasrConfig()
        self.crp[target_corpus].acoustic_model_post_config.allophones.add_from_file = (
            self.allophone_files[target_corpus])
Example #10
def recognized_warping_factor_flow(
    feature_net,
    alphas_file,
    mixtures,
    filterbank_node="filterbank",
    amplitude_spectrum_node="amplitude-spectrum",
    omega=0.875,
):
    assert filterbank_node in feature_net.nodes
    assert feature_net.nodes[filterbank_node]["filter"] == "signal-filterbank"
    assert amplitude_spectrum_node in feature_net.nodes

    # copy original net
    net = rasr.FlowNetwork(name=feature_net.name)
    mapping = net.add_net(feature_net)
    net.interconnect_inputs(feature_net, mapping)
    net.interconnect_outputs(feature_net, mapping)

    # remove output for features
    original_feature_outputs = net.get_output_links("features")
    net.unlink(to_name="%s:%s" % (net.name, "features"))

    warped_net, broken_links = feature_net.subnet_from_node(filterbank_node)

    warped_mapping = net.add_net(warped_net)
    net.interconnect_outputs(warped_net, warped_mapping)

    for l in broken_links:
        net.link(mapping[l[0]], warped_mapping[l[1]])

    fbnode = net.nodes[warped_mapping[filterbank_node]]
    fbnode["warping-function"] = "nest(linear-2($input(alpha), %s), %s)" % (
        omega,
        fbnode["warping-function"],
    )

    # energy
    energy = net.add_node("generic-vector-f32-norm", "energy", {"value": 1})
    net.link(mapping[amplitude_spectrum_node], energy)

    convert_energy_to_vector = net.add_node(
        "generic-convert-f32-to-vector-f32", "convert-energy-to-vector"
    )
    net.link(energy, convert_energy_to_vector)

    energy_normalization = net.add_node(
        "signal-normalization",
        "energy-normalization",
        {"type": "divide-by-mean", "length": "infinite", "right": "infinite"},
    )
    net.link(convert_energy_to_vector, energy_normalization)

    convert_energy_to_scalar = net.add_node(
        "generic-convert-vector-f32-to-f32", "convert-energy-vector-to-scalar"
    )
    net.link(energy_normalization, convert_energy_to_scalar)

    energy_sync = net.add_node("generic-synchronization", "energy-sync")
    net.link(convert_energy_to_scalar, energy_sync)
    net.link(original_feature_outputs.pop(), "%s:target" % energy_sync)

    rec = net.add_node(
        "signal-bayes-classification",
        "warping-factor-recognizer",
        {"class-label-file": alphas_file},
    )
    net.link(rec, "%s:alpha" % warped_mapping[filterbank_node])
    net.link(energy_sync, "%s:feature-score-weight" % rec)
    net.link("%s:target" % energy_sync, rec)

    net.config = rasr.RasrConfig()
    net.config[rec].likelihood_function.file = mixtures
    net.config[rec].likelihood_function.feature_scorer_type = "SIMD-diagonal-maximum"

    return net
Example #11
    def nn_align(
        self,
        name,
        corpus_key,
        flow,
        tf_checkpoint,
        pronunciation_scale,
        alignment_options=None,
        parallelize_conversion=False,
        prefix="",
        **kwargs,
    ):
        """
        :param str name:
        :param str corpus_key:
        :param str|list[str]|tuple[str]|rasr.FlagDependentFlowAttribute flow:
        :param Checkpoint tf_checkpoint:
        :param float pronunciation_scale:
        :param float lm_scale:
        :param bool lm_lookahead:
        :param dict|None lookahead_options:
        :param bool parallelize_conversion:
        :param dict|None lattice_to_ctm_kwargs:
        :param str prefix:
        :param kwargs:
        :return:
        """

        # self.crp[corpus_key].language_model_config.scale = lm_scale
        # self.crp[corpus_key].acoustic_model_config.tdp["*"].skip = 0
        # self.crp[corpus_key].acoustic_model_config.tdp.silence.skip = 0

        model_combination_config = rasr.RasrConfig()
        model_combination_config.pronunciation_scale = pronunciation_scale

        # label tree #
        label_unit = kwargs.pop('label_unit', None)
        assert label_unit, 'label_unit not given'
        label_tree_args = kwargs.pop('label_tree_args', {})
        # label_tree = rasr_experimental.LabelTree(label_unit, **label_tree_args)

        scorer_type = kwargs.pop('label_scorer_type', None)
        assert scorer_type, 'label_scorer_type not given'
        label_scorer_args = kwargs.pop('label_scorer_args', {})
        # add vocab file
        from i6_experiments.users.rossenbach.rasr.vocabulary import GenerateLabelFileFromStateTying
        label_scorer_args['labelFile'] = GenerateLabelFileFromStateTying(
            self.state_tying, add_eow=True).out_label_file
        label_scorer_args['priorFile'] = self.estimate_nn_prior(
            self.train_corpora[0],
            feature_flow=flow,
            tf_checkpoint=tf_checkpoint,
            **kwargs)
        am_scale = label_scorer_args.get('scale', 1.0)

        tf_graph = self.make_model_graph(self.returnn_config)

        feature_flow = self.make_tf_feature_flow(
            self.feature_flows[corpus_key][flow], tf_graph, tf_checkpoint,
            **kwargs)

        label_scorer = rasr_experimental.LabelScorer(scorer_type,
                                                     **label_scorer_args)

        extra_config = rasr.RasrConfig()
        if pronunciation_scale > 0:
            extra_config.flf_lattice_tool.network.recognizer.pronunciation_scale = pronunciation_scale

        # Fixed CTC settings:
        extra_config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction.alignment.allow_label_loop = True

        if alignment_options is None:
            alignment_options = {
                'label-pruning': 10,
                'label-pruning-limit': 10000,
            }

        # label alignment
        align_args = {
            'crp': self.crp[corpus_key],
            'use_gpu': kwargs.get('use_gpu', True),
            'feature_flow': feature_flow,
            'label_scorer': label_scorer,
            'alignment_options': alignment_options,  # aligner search option,
            'extra_config': extra_config,
        }
        align_job = LabelAlignmentJob(**align_args)
        #align_job.rqmt.update(job_rqmt)
        alignment = rasr.FlagDependentFlowAttribute(
            'cache_mode', {
                'task_dependent': align_job.out_alignment_path,
                'bundle': align_job.out_alignment_bundle
            })
        self.alignments[corpus_key][name] = [alignment]
        if kwargs.get('register_output', False):
            tk.register_output('%s_%s' % (corpus_key, name),
                               align_job.out_alignment_bundle)
        return name
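A hedged call sketch, assuming a system instance with corpora, feature flows, and state tying already set up; all argument values, including the scorer type, are illustrative assumptions.

aligned_name = system.nn_align(
    name="ctc_align",
    corpus_key="train",
    flow="gt",                    # key into system.feature_flows, assumed
    tf_checkpoint=checkpoint,     # trained returnn.Checkpoint
    pronunciation_scale=0.0,
    label_unit="phoneme",         # required via kwargs, see assert above
    label_scorer_type="precomputed-log-posterior",  # assumed scorer type
    register_output=True,
)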
Example #12
    def create_config(cls,
                      crp,
                      feature_flow,
                      label_tree,
                      label_scorer,
                      search_parameters=None,
                      lm_lookahead=True,
                      lookahead_options=None,
                      eval_single_best=True,
                      eval_best_in_lattice=True,
                      extra_config=None,
                      extra_post_config=None,
                      sprint_exe=None,
                      lm_gc_job=None,
                      lm_gc_job_local=False,
                      lm_gc_job_mem=16,
                      lm_gc_job_default_search=False,
                      **kwargs):

        # optional individual lm-image and global-cache job #
        if lm_gc_job is None:
            lm_gc_job = LabelSyncSearchLmImageAndGlobalCacheJob(
                crp,
                label_tree,
                label_scorer,
                extra_config,
                extra_post_config,
                mem=lm_gc_job_mem,
                local_job=lm_gc_job_local,
                sprint_exe=sprint_exe,
                default_search=lm_gc_job_default_search,
            )

        # get config from crp #
        config, post_config = rasr.build_config_from_mapping(
            crp,
            {
                "corpus": "flf-lattice-tool.corpus",
                "lexicon": "flf-lattice-tool.lexicon",
                "acoustic_model":
                "flf-lattice-tool.network.recognizer.acoustic-model",
                "language_model": "flf-lattice-tool.network.recognizer.lm",
            },
            parallelize=True,
        )

        # acoustic model may be used for allophones and state-tying, but no mixtures are needed #
        # skip conventional AM or load it without GMM #
        if crp.acoustic_model_config is None:
            config.flf_lattice_tool.network.recognizer.use_acoustic_model = False
        else:
            config.flf_lattice_tool.network.recognizer.use_mixture = False

        # feature flow #
        config.flf_lattice_tool.network.recognizer.feature_extraction.file = (
            "feature.flow")
        feature_flow.apply_config(
            "flf-lattice-tool.network.recognizer.feature-extraction",
            config,
            post_config,
        )

        # label tree and optional lexicon overwrite #
        label_tree.apply_config(
            "flf-lattice-tool.network.recognizer.recognizer.label-tree",
            config,
            post_config,
        )
        if label_tree.lexicon_config is not None:
            config["flf-lattice-tool.lexicon"]._update(
                label_tree.lexicon_config)

        # label scorer #
        label_scorer.apply_config(
            "flf-lattice-tool.network.recognizer.label-scorer", config,
            post_config)

        # search settings #
        search_config = rasr.RasrConfig()
        if search_parameters is not None:
            for key in search_parameters.keys():
                search_config[key] = search_parameters[key]
        config.flf_lattice_tool.network.recognizer.recognizer._update(
            search_config)

        # lookahead settings #
        config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead._value = (
            lm_lookahead)
        if lm_lookahead:
            lookahead_config = rasr.RasrConfig()
            if lookahead_options is not None:
                for key in lookahead_options.keys():
                    lookahead_config[key] = lookahead_options[key]
            config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead._update(
                lookahead_config)

        # flf network #
        config.flf_lattice_tool.network.initial_nodes = "segment"
        config.flf_lattice_tool.network.segment.type = "speech-segment"
        config.flf_lattice_tool.network.segment.links = (
            "1->recognizer:1 0->archive-writer:1 0->evaluator:1")

        config.flf_lattice_tool.network.recognizer.type = "recognizer"
        config.flf_lattice_tool.network.recognizer.search_type = "label-sync-search"
        config.flf_lattice_tool.network.recognizer.apply_non_word_closure_filter = False
        config.flf_lattice_tool.network.recognizer.add_confidence_score = False
        config.flf_lattice_tool.network.recognizer.apply_posterior_pruning = False

        if label_scorer.config.label_unit == "hmm":
            config.flf_lattice_tool.network.recognizer.links = "expand"
            config.flf_lattice_tool.network.expand.type = "expand-transits"
            config.flf_lattice_tool.network.expand.links = "evaluator archive-writer"
        else:
            config.flf_lattice_tool.network.recognizer.links = (
                "evaluator archive-writer")

        config.flf_lattice_tool.network.evaluator.type = "evaluator"
        config.flf_lattice_tool.network.evaluator.links = "sink:0"
        config.flf_lattice_tool.network.evaluator.word_errors = True
        config.flf_lattice_tool.network.evaluator.single_best = eval_single_best
        config.flf_lattice_tool.network.evaluator.best_in_lattice = eval_best_in_lattice
        config.flf_lattice_tool.network.evaluator.edit_distance.format = "bliss"
        config.flf_lattice_tool.network.evaluator.edit_distance.allow_broken_words = (
            False)

        config.flf_lattice_tool.network.archive_writer.type = "archive-writer"
        config.flf_lattice_tool.network.archive_writer.links = "sink:1"
        config.flf_lattice_tool.network.archive_writer.format = "flf"
        config.flf_lattice_tool.network.archive_writer.path = "lattice.cache.$(TASK)"
        post_config.flf_lattice_tool.network.archive_writer.info = True

        config.flf_lattice_tool.network.sink.type = "sink"
        post_config.flf_lattice_tool.network.sink.warn_on_empty_lattice = True
        post_config.flf_lattice_tool.network.sink.error_on_empty_lattice = False
        post_config["*"].output_channel.unbuffered = True

        # update parameters #
        config._update(extra_config)
        post_config._update(extra_post_config)

        # image and cache #
        arpa_lms = LabelSyncSearchLmImageAndGlobalCacheJob.find_arpa_lms(
            config)
        assert (len(arpa_lms) == lm_gc_job.num_images
                ), "mismatch between image-cache config and recognition config"
        for i, lm_config in enumerate(arpa_lms):
            lm_config.image = lm_gc_job.lm_images[i + 1]

        if post_config.flf_lattice_tool.global_cache._get("file") is None:
            post_config.flf_lattice_tool.global_cache.read_only = True
            post_config.flf_lattice_tool.global_cache.file = lm_gc_job.global_cache

        return config, post_config
Example #13
def get_monophone_args(
    feature_flow: str = "mfcc+deriv+norm",
    *,
    train_align_iter: int = 75,
    allow_zero_weights: bool = False,
    zero_weights_in: str = "extra_config",
):
    linear_alignment_args = {
        "minimum_segment_length": 0,
        "maximum_segment_length": 6000,
        "iterations": 5,
        "penalty": 0,
        "minimum_speech_proportion": 0.7,
        "save_alignment": False,
        "keep_accumulators": False,
        "extra_merge_args": None,
        "extra_config": None,
        "extra_post_config": None,
    }

    monophone_training_args = {
        "name": "mono",
        "feature_flow": feature_flow,
        "feature_energy_flow_key": f"energy,{feature_flow}",
        "align_iter": train_align_iter,
        "splits": 10,
        "accs_per_split": 2,
    }

    monophone_recognition_args = {
        # GmmSystem.recognition() args:
        "iters": [8, 10],
        "lm_scales": [10.5],
        "optimize_am_lm_scale": True,
        # meta.System.recog() args:
        "feature_flow": feature_flow,
        "pronunciation_scales": [6.0],
        "lm_lookahead": True,
        "lookahead_options": None,
        "create_lattice": True,
        "eval_single_best": True,
        "eval_best_in_lattice": True,
        "search_parameters": {
            "beam-pruning": 18.0,
            "beam-pruning-limit": 100000,
            "word-end-pruning": 0.75,
            "word-end-pruning-limit": 15000,
        },
        "parallelize_conversion": False,
        "lattice_to_ctm_kwargs": {
            "fill_empty_segments": False,
            "best_path_algo": "bellman-ford",
        },
        "rtf": 50,
        "mem": 8,
        "use_gpu": False,
    }

    monophone_test_recognition_args = None
    # {
    #    "optimize_am_lm_scale": False,
    #    "pronunciation_scales": [1.0],
    #    "lm_scales": [11.0],
    # }

    if allow_zero_weights:
        allow_zero_weights_extra_config = rasr.RasrConfig()
        allow_zero_weights_extra_config.allow_zero_weights = True

        monophone_training_args["align_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        monophone_training_args["accumulate_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        monophone_training_args["split_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        monophone_recognition_args[
            zero_weights_in] = allow_zero_weights_extra_config

    sdm_args = {
        "name": "sdm.mono",
        "alignment": "train_mono",
        "feature_flow_key": feature_flow,
    }

    return rasr_util.GmmMonophoneArgs(
        linear_alignment_args=linear_alignment_args,
        training_args=monophone_training_args,
        recognition_args=monophone_recognition_args,
        test_recognition_args=monophone_test_recognition_args,
        sdm_args=sdm_args,
    )
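Assuming rasr_util.GmmMonophoneArgs exposes its constructor fields as attributes, individual settings can be overridden after construction; a small, assumed example:

mono_args = get_monophone_args(feature_flow="mfcc+deriv+norm")
# tighten the recognition beam; keys follow the dict defined above
mono_args.recognition_args["search_parameters"]["beam-pruning"] = 16.0
mono_args.recognition_args["lm_scales"] = [9.5, 10.5]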
Example #14
def get_triphone_args(
    name: str = "tri",
    initial_alignment: str = "mono",
    feature_flow: str = "mfcc+context+lda",
    allow_zero_weights: bool = False,
    zero_weights_in: str = "extra_config",
):
    triphone_training_args = {
        "name": name,
        "initial_alignment": f"train_{initial_alignment}",
        "feature_flow": feature_flow,
        "splits": 10,
        "accs_per_split": 2,
        "align_extra_rqmt": {
            "mem": 8
        },
        "accumulate_extra_rqmt": {
            "mem": 8
        },
        "split_extra_rqmt": {
            "mem": 8
        },
    }

    triphone_recognition_args = {
        "iters": [8, 10],
        "feature_flow": feature_flow,
        "pronunciation_scales": [6.0],
        "lm_scales": [24.9],
        "lm_lookahead": True,
        "lookahead_options": None,
        "create_lattice": True,
        "eval_single_best": True,
        "eval_best_in_lattice": True,
        "search_parameters": {
            "beam_pruning": 12.0,
            "beam-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 15000,
        },
        "lattice_to_ctm_kwargs": {
            "fill_empty_segments": False,
            "best_path_algo": "bellman-ford",
        },
        "optimize_am_lm_scale": True,
        "rtf": 50,
        "mem": 8,
        "parallelize_conversion": True,
    }

    if allow_zero_weights:
        allow_zero_weights_extra_config = rasr.RasrConfig()
        allow_zero_weights_extra_config.allow_zero_weights = True

        triphone_training_args["align_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        triphone_training_args["accumulate_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        triphone_training_args["split_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        triphone_recognition_args[
            zero_weights_in] = allow_zero_weights_extra_config

    sdm_args = {
        "name": f"sdm.{name}",
        "alignment": f"train_{name}",
        "feature_flow_key": feature_flow,
    }

    return rasr_util.GmmTriphoneArgs(
        training_args=triphone_training_args,
        recognition_args=triphone_recognition_args,
        sdm_args=sdm_args,
    )
Example #15
def get_vtln_args(
    name: str = "vtln",
    feature_flow: str = "mfcc+context+lda",
    initial_alignment_key: str = "tri",
    allow_zero_weights: bool = False,
    zero_weights_in: str = "extra_config",
):
    vtln_training_args = {
        "feature_flow": {
            "name": f"uncached_{feature_flow}",
            "lda_matrix_key": "mono",
            "base_flow_key": f"uncached_{feature_flow.split('+')[0]}",
            "context_size": 9,
        },
        "warp_mix": {
            "name": "tri",
            "alignment": f"train_{initial_alignment_key}",
            "feature_scorer": "estimate_mixtures_sdm.tri",
            "splits": 8,
            "accs_per_split": 2,
        },
        "train": {
            "name": name,
            "initial_alignment_key": f"train_{initial_alignment_key}",
            "splits": 10,
            "accs_per_split": 2,
            "feature_flow": f"{feature_flow}+vtln",
            "accumulate_extra_rqmt": {
                "mem": 8
            },
            "align_extra_rqmt": {
                "mem": 8
            },
            "split_extra_rqmt": {
                "mem": 8
            },
        },
    }

    vtln_recognition_args = {
        "iters": [8, 10],
        "feature_flow": f"uncached_{feature_flow}+vtln",
        "pronunciation_scales": [6.0],
        "lm_scales": [22.4],
        "lm_lookahead": True,
        "lookahead_options": None,
        "create_lattice": True,
        "eval_single_best": True,
        "eval_best_in_lattice": True,
        "search_parameters": {
            "beam_pruning": 12.0,
            "beam-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 15000,
        },
        "lattice_to_ctm_kwargs": {
            "fill_empty_segments": False,
            "best_path_algo": "bellman-ford",
        },
        "optimize_am_lm_scale": True,
        "rtf": 50,
        "mem": 8,
        "parallelize_conversion": True,
    }

    if allow_zero_weights:
        allow_zero_weights_extra_config = rasr.RasrConfig()
        allow_zero_weights_extra_config.allow_zero_weights = True

        vtln_training_args["train"]["align_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_training_args["train"]["accumulate_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_training_args["train"]["split_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_recognition_args[
            zero_weights_in] = allow_zero_weights_extra_config

    sdm_args = {
        "name": f"sdm.{name}",
        "alignment": f"train_{name}",
        "feature_flow_key": f"{feature_flow}+vtln",
    }

    return rasr_util.GmmVtlnArgs(
        training_args=vtln_training_args,
        recognition_args=vtln_recognition_args,
        sdm_args=sdm_args,
    )
Example #16
def samples_with_silence_normalization_flow(
    audio_format="wav", dc_detection=True, dc_params=None, silence_params=None
):
    _dc_params = {
        "min-dc-length": 0.01,
        "max-dc-increment": 0.9,
        "min-non-dc-segment-length": 0.021,
    }
    _silence_params = {
        "absolute-silence-threshold": 250,
        "discard-unsure-segments": True,
        "min-surrounding-silence": 0.1,
        "fill-up-silence": True,
        "silence-ratio": 0.25,
        "silence-threshold": 0.05,
    }
    if dc_params is not None:
        _dc_params.update(dc_params)
    if silence_params is not None:
        _silence_params.update(silence_params)

    net = rasr.FlowNetwork()

    net.add_output("samples")
    net.add_param(["input-file", "start-time", "end-time", "track"])

    samples = net.add_node(
        "audio-input-file-" + audio_format,
        "samples",
        {
            "file": "$(input-file)",
            "start-time": "$(start-time)",
            "end-time": "$(end-time)",
        },
    )

    demultiplex = net.add_node(
        "generic-vector-s16-demultiplex", "demultiplex", track="$(track)"
    )
    net.link(samples, demultiplex)

    convert = net.add_node("generic-convert-vector-s16-to-vector-f32", "convert")
    net.link(demultiplex, convert)

    sil_norm = net.add_node("signal-silence-normalization", "silence-normalization")
    net.link(convert, sil_norm)
    warp_time = net.add_node("warp-time", "warp-time", {"start-time": "$(start-time)"})
    if dc_detection:
        dc_detection = net.add_node("signal-dc-detection", "dc-detection", _dc_params)
        net.link(sil_norm, dc_detection)
        net.link(dc_detection, warp_time)
    else:
        net.link(sil_norm, warp_time)

    net.link(warp_time, "network:samples")

    net.config = rasr.RasrConfig()
    for k, v in _silence_params.items():
        net.config[sil_norm][k] = v

    return net
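A brief usage sketch; the override keys follow the _silence_params defaults defined at the top of the function.

# wav input with a slightly stricter silence threshold; values illustrative
net = samples_with_silence_normalization_flow(
    audio_format="wav",
    dc_detection=True,
    silence_params={"silence-threshold": 0.03},  # overrides the 0.05 default
)
# net then serves as the sample source of a larger feature-extraction network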
Example #17
    def recog(
        self,
        name,
        corpus,
        flow,
        feature_scorer,
        pronunciation_scale,
        lm_scale,
        parallelize_conversion=False,
        lattice_to_ctm_kwargs=None,
        prefix="",
        **kwargs,
    ):
        """
        :param str name:
        :param str corpus:
        :param str|list[str]|tuple[str]|rasr.FlagDependentFlowAttribute flow:
        :param str|list[str]|tuple[str]|rasr.FeatureScorer feature_scorer:
        :param float pronunciation_scale:
        :param float lm_scale:
        :param bool parallelize_conversion:
        :param dict lattice_to_ctm_kwargs:
        :param str prefix:
        :param kwargs:
        :return:
        """
        if lattice_to_ctm_kwargs is None:
            lattice_to_ctm_kwargs = {}

        self.crp[corpus].language_model_config.scale = lm_scale
        model_combination_config = rasr.RasrConfig()
        model_combination_config.pronunciation_scale = pronunciation_scale

        rec = recog.AdvancedTreeSearchJob(
            crp=self.crp[corpus],
            feature_flow=select_element(self.feature_flows, corpus, flow),
            feature_scorer=select_element(self.feature_scorers, corpus,
                                          feature_scorer),
            model_combination_config=model_combination_config,
            **kwargs,
        )
        rec.set_vis_name("Recog %s%s" % (prefix, name))
        rec.add_alias("%srecog_%s" % (prefix, name))
        self.jobs[corpus]["recog_%s" % name] = rec

        self.jobs[corpus]["lat2ctm_%s" %
                          name] = lat2ctm = recog.LatticeToCtmJob(
                              crp=self.crp[corpus],
                              lattice_cache=rec.out_lattice_bundle,
                              parallelize=parallelize_conversion,
                              **lattice_to_ctm_kwargs,
                          )
        self.ctm_files[corpus]["recog_%s" % name] = lat2ctm.out_ctm_file

        kwargs = copy.deepcopy(self.scorer_args[corpus])
        kwargs[self.scorer_hyp_arg[corpus]] = lat2ctm.out_ctm_file
        scorer = self.scorers[corpus](**kwargs)

        self.jobs[corpus]["scorer_%s" % name] = scorer
        tk.register_output("%srecog_%s.reports" % (prefix, name),
                           scorer.out_report_dir)
Example #18
    def recog(
        self,
        name,
        corpus_key,
        flow,
        tf_checkpoint,
        pronunciation_scale,
        lm_scale,
        lm_lookahead,
        lookahead_options=None,
        parallelize_conversion=False,
        lattice_to_ctm_kwargs=None,
        prefix="",
        **kwargs,
    ):
        """
        :param str name:
        :param str corpus_key:
        :param str|list[str]|tuple[str]|rasr.FlagDependentFlowAttribute flow:
        :param Checkpoint tf_checkpoint:
        :param float pronunciation_scale:
        :param float lm_scale:
        :param bool lm_lookahead:
        :param dict|None lookahead_options:
        :param bool parallelize_conversion:
        :param dict|None lattice_to_ctm_kwargs:
        :param str prefix:
        :param kwargs:
        :return:
        """
        if lattice_to_ctm_kwargs is None:
            lattice_to_ctm_kwargs = {}

        self.crp[corpus_key].language_model_config.scale = lm_scale
        self.crp[corpus_key].acoustic_model_config.tdp["*"].skip = 0
        self.crp[corpus_key].acoustic_model_config.tdp.silence.skip = 0

        model_combination_config = rasr.RasrConfig()
        model_combination_config.pronunciation_scale = pronunciation_scale

        # label tree #
        label_unit = kwargs.pop('label_unit', None)
        assert label_unit, 'label_unit not given'
        label_tree_args = kwargs.pop('label_tree_args', {})
        label_tree = rasr_experimental.LabelTree(label_unit, **label_tree_args)

        scorer_type = kwargs.pop('label_scorer_type', None)
        assert scorer_type, 'label_scorer_type not given'
        label_scorer_args = kwargs.pop('label_scorer_args', {})
        # add vocab file
        from i6_experiments.users.rossenbach.rasr.vocabulary import GenerateLabelFileFromStateTying
        label_scorer_args['labelFile'] = GenerateLabelFileFromStateTying(
            self.state_tying, add_eow=True).out_label_file
        label_scorer_args['priorFile'] = self.estimate_nn_prior(
            self.train_corpora[0],
            feature_flow=flow,
            tf_checkpoint=tf_checkpoint,
            **kwargs)
        am_scale = label_scorer_args.get('scale', 1.0)

        tf_graph = self.make_model_graph(self.returnn_config)

        feature_flow = self.make_tf_feature_flow(
            self.feature_flows[corpus_key][flow], tf_graph, tf_checkpoint,
            **kwargs)

        label_scorer = rasr_experimental.LabelScorer(scorer_type,
                                                     **label_scorer_args)

        extra_config = rasr.RasrConfig()
        if pronunciation_scale > 0:
            extra_config.flf_lattice_tool.network.recognizer.pronunciation_scale = pronunciation_scale

        if lm_lookahead:
            assert lookahead_options is not None
            # we want to alter this now
            lookahead_options = copy.deepcopy(lookahead_options)
            if lookahead_options.get("scale", None) is None:
                lookahead_options["scale"] = lm_scale

        # Fixed CTC settings:
        extra_config.flf_lattice_tool.network.recognizer.recognizer.allow_label_loop = True
        extra_config.flf_lattice_tool.network.recognizer.recognizer.allow_blank_label = True

        extra_config.flf_lattice_tool.network.recognizer.recognizer.allow_label_recombination = True
        extra_config.flf_lattice_tool.network.recognizer.recognizer.allow_word_end_recombination = True

        rec = LabelSyncSearchJob(
            crp=self.crp[corpus_key],
            feature_flow=feature_flow,
            label_scorer=label_scorer,
            label_tree=label_tree,
            lm_lookahead=lm_lookahead,
            lookahead_options=lookahead_options,
            extra_config=extra_config,
            **kwargs,
        )
        rec.set_vis_name("Recog %s%s" % (prefix, name))
        rec.add_alias("%srecog_%s" % (prefix, name))
        self.jobs[corpus_key]["recog_%s" % name] = rec

        self.jobs[corpus_key]["lat2ctm_%s" %
                              name] = lat2ctm = recog.LatticeToCtmJob(
                                  crp=self.crp[corpus_key],
                                  lattice_cache=rec.out_lattice_bundle,
                                  parallelize=parallelize_conversion,
                                  **lattice_to_ctm_kwargs,
                              )
        self.ctm_files[corpus_key]["recog_%s" % name] = lat2ctm.out_ctm_file

        kwargs = copy.deepcopy(self.scorer_args[corpus_key])
        kwargs[self.scorer_hyp_arg[corpus_key]] = lat2ctm.out_ctm_file
        scorer = self.scorers[corpus_key](**kwargs)

        self.jobs[corpus_key]["scorer_%s" % name] = scorer
        tk.register_output("%srecog_%s.reports" % (prefix, name),
                           scorer.out_report_dir)
Example #19
    def create_config(
        cls,
        crp,
        feature_flow,
        feature_scorer,
        alignment_options,
        extra_config,
        extra_post_config,
        **kwargs,
    ):
        alignment_flow = cls.create_flow(feature_flow)

        alignopt = {
            "increase-pruning-until-no-score-difference": True,
            "min-acoustic-pruning": 500,
            "max-acoustic-pruning": 10000,
            "acoustic-pruning-increment-factor": 2,
        }
        if alignment_options is not None:
            alignopt.update(alignment_options)

        mapping = {
            "corpus": "acoustic-model-trainer.corpus",
            "lexicon": [],
            "acoustic_model": [],
        }

        # acoustic model + lexicon for the flow nodes
        for node in alignment_flow.get_node_names_by_filter(
                "speech-alignment"):
            mapping["lexicon"].append(
                "acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.lexicon"
                % node)
            mapping["acoustic_model"].append(
                "acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.acoustic-model"
                % node)

        config, post_config = rasr.build_config_from_mapping(crp,
                                                             mapping,
                                                             parallelize=True)

        # alignment options for the flow nodes
        for node in alignment_flow.get_node_names_by_filter(
                "speech-alignment"):
            node_config = config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction[
                node]

            node_config.aligner = rasr.RasrConfig()
            for k, v in alignopt.items():
                node_config.aligner[k] = v
            feature_scorer.apply_config(
                "model-combination.acoustic-model.mixture-set", node_config,
                node_config)

            node_config.store_lattices = True
            node_config.lattice_archive.path = "numerator.$(TASK)"

        alignment_flow.apply_config(
            "acoustic-model-trainer.aligning-feature-extractor.feature-extraction",
            config,
            post_config,
        )

        config.action = "dry"
        config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction.file = (
            "alignment.flow")
        post_config["*"].allow_overwrite = True

        config._update(extra_config)
        post_config._update(extra_post_config)

        return config, post_config
Example #20
    def create_config(
        cls,
        crp,
        feature_flow,
        feature_scorer,
        alignment_options,
        word_boundaries,
        extra_config,
        extra_post_config,
        **kwargs,
    ):
        """
        :param rasr.crp.CommonRasrParameters crp:
        :param feature_flow:
        :param rasr.FeatureScorer feature_scorer:
        :param dict[str] alignment_options:
        :param bool word_boundaries:
        :param extra_config:
        :param extra_post_config:
        :return: config, post_config
        :rtype: (rasr.RasrConfig, rasr.RasrConfig)
        """
        alignment_flow = cls.create_flow(feature_flow)

        # TODO: think about mode
        alignopt = {
            "increase-pruning-until-no-score-difference": True,
            "min-acoustic-pruning": 500,
            "max-acoustic-pruning": 4000,
            "acoustic-pruning-increment-factor": 2,
        }
        if alignment_options is not None:
            alignopt.update(alignment_options)

        mapping = {
            "corpus": "acoustic-model-trainer.corpus",
            "lexicon": [],
            "acoustic_model": [],
        }

        # acoustic model + lexicon for the flow nodes
        for node in alignment_flow.get_node_names_by_filter(
                "speech-alignment"):
            mapping["lexicon"].append(
                "acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.lexicon"
                % node)
            mapping["acoustic_model"].append(
                "acoustic-model-trainer.aligning-feature-extractor.feature-extraction.%s.model-combination.acoustic-model"
                % node)

        config, post_config = rasr.build_config_from_mapping(crp,
                                                             mapping,
                                                             parallelize=True)

        # alignment options for the flow nodes
        for node in alignment_flow.get_node_names_by_filter(
                "speech-alignment"):
            node_config = config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction[
                node]

            node_config.aligner = rasr.RasrConfig()
            for k, v in alignopt.items():
                node_config.aligner[k] = v
            feature_scorer.apply_config(
                "model-combination.acoustic-model.mixture-set", node_config,
                node_config)

            if word_boundaries:
                node_config.store_lattices = True
                node_config.lattice_archive.path = "word_boundary.cache.$(TASK)"

        alignment_flow.apply_config(
            "acoustic-model-trainer.aligning-feature-extractor.feature-extraction",
            config,
            post_config,
        )

        config.action = "dry"
        config.acoustic_model_trainer.aligning_feature_extractor.feature_extraction.file = (
            "alignment.flow")
        post_config["*"].allow_overwrite = True

        config._update(extra_config)
        post_config._update(extra_post_config)

        return config, post_config
Example #21
def get_vtln_sat_args(
    name: str = "vtln+sat",
    feature_flow: str = "mfcc+context+lda+vtln",
    initial_mixture: str = "estimate_mixtures_sdm.vtln",
    initial_alignment: str = "vtln",
    allow_zero_weights: bool = False,
    zero_weights_in: str = "extra_config",
):
    feature_base_cache = feature_flow.split("+")[0]
    vtln_sat_training_args = {
        "name": name,
        "mixtures": initial_mixture,
        "alignment": f"train_{initial_alignment}",
        "feature_cache": feature_flow,
        "feature_flow_key": feature_flow,
        "cache_regex": "^.*\\+vtln$",
        "splits": 10,
        "accs_per_split": 2,
        "accumulate_extra_rqmt": {
            "mem": 8
        },
        "align_extra_rqmt": {
            "mem": 8
        },
        "split_extra_rqmt": {
            "mem": 8
        },
    }

    vtln_sat_recognition_args = {
        "prev_ctm": (
            "vtln",
            6.0,
            22.4,
            10,
            "-optlm",
        ),  # (name, pron_scale, lm_scale, it, opt)
        "feature_cache": feature_base_cache,
        "cache_regex": f"^{feature_base_cache}.*$",
        "cmllr_mixtures": initial_mixture,
        "iters": [8, 10],
        "feature_flow": f"uncached_{feature_flow}",
        "pronunciation_scales": [6.0],
        "lm_scales": [30.0],
        "lm_lookahead": True,
        "lookahead_options": None,
        "create_lattice": True,
        "eval_single_best": True,
        "eval_best_in_lattice": True,
        "search_parameters": {
            "beam_pruning": 12.0,
            "beam-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 15000,
        },
        "lattice_to_ctm_kwargs": {
            "fill_empty_segments": False,
            "best_path_algo": "bellman-ford",
        },
        "optimize_am_lm_scale": True,
        "rtf": 50,
        "mem": 8,
        "parallelize_conversion": True,
    }

    if allow_zero_weights:
        allow_zero_weights_extra_config = rasr.RasrConfig()
        allow_zero_weights_extra_config.allow_zero_weights = True

        vtln_sat_training_args["align_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_sat_training_args["accumulate_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_sat_training_args["split_extra_args"] = {
            zero_weights_in: allow_zero_weights_extra_config
        }
        vtln_sat_recognition_args[
            zero_weights_in] = allow_zero_weights_extra_config

    sdm_args = {
        "name": f"sdm.{name}",
        "alignment": f"train_{name}",
        "feature_flow_key": f"{feature_flow}+cmllr",
    }

    return rasr_util.GmmVtlnSatArgs(
        training_args=vtln_sat_training_args,
        recognition_args=vtln_sat_recognition_args,
        sdm_args=sdm_args,
    )
Example #22
    def create_config(
        cls,
        crp,
        feature_flow,
        feature_scorer,
        search_parameters,
        lm_lookahead,
        lookahead_options,
        mem,
        model_combination_config,
        model_combination_post_config,
        extra_config,
        extra_post_config,
        **kwargs,
    ):

        lm_gc = recognition.AdvancedTreeSearchLmImageAndGlobalCacheJob(
            crp, feature_scorer, extra_config, extra_post_config)
        lm_gc.rqmt["mem"] = mem

        if search_parameters is None:
            search_parameters = {}

        default_search_parameters = {
            "beam-pruning": 15,
            "beam-pruning-limit": 100000,
            "word-end-pruning": 0.5,
            "word-end-pruning-limit": 10000,
        }
        default_search_parameters.update(search_parameters)
        search_parameters = default_search_parameters

        la_opts = {
            "history_limit": 1,
            "tree_cutoff": 30,
            "minimum_representation": 1,
            "cache_low": 2000,
            "cache_high": 3000,
            "laziness": 15,
        }
        if lookahead_options is not None:
            la_opts.update(lookahead_options)

        config, post_config = rasr.build_config_from_mapping(
            crp,
            {
                "corpus": "speech-recognizer.corpus",
                "lexicon": "speech-recognizer.model-combination.lexicon",
                "acoustic_model":
                "speech-recognizer.model-combination.acoustic-model",
                "language_model": "speech-recognizer.model-combination.lm",
            },
            parallelize=True,
        )

        # Parameters for Speech::Recognizer
        config.speech_recognizer.search_type = "advanced-tree-search"

        # Parameters for Speech::DataSource or Sparse::DataSource
        config.speech_recognizer.feature_extraction.file = "feature.flow"
        feature_flow.apply_config("speech-recognizer.feature-extraction",
                                  config, post_config)

        # Parameters for Am::ClassicAcousticModel
        feature_scorer.apply_config(
            "speech-recognizer.model-combination.acoustic-model.mixture-set",
            config,
            post_config,
        )

        # Parameters for Speech::Model combination (besides AM and LM parameters)
        config.speech_recognizer.model_combination.pronunciation_scale = 3.0
        config.speech_recognizer.model_combination._update(
            model_combination_config)
        post_config.speech_recognizer.model_combination._update(
            model_combination_post_config)

        # Search parameters
        config.speech_recognizer.recognizer.create_lattice = True
        config.speech_recognizer.store_lattices = True

        config.speech_recognizer.recognizer.beam_pruning = search_parameters[
            "beam-pruning"]
        config.speech_recognizer.recognizer.beam_pruning_limit = search_parameters[
            "beam-pruning-limit"]
        config.speech_recognizer.recognizer.word_end_pruning = search_parameters[
            "word-end-pruning"]
        config.speech_recognizer.recognizer.word_end_pruning_limit = search_parameters[
            "word-end-pruning-limit"]

        config.speech_recognizer.recognizer.lm_lookahead = rasr.RasrConfig()
        config.speech_recognizer.recognizer.lm_lookahead._value = lm_lookahead
        config.speech_recognizer.recognizer.optimize_lattice = "simple"
        if lm_lookahead:
            config.speech_recognizer.recognizer.lm_lookahead_laziness = la_opts[
                "laziness"]
            config.speech_recognizer.recognizer.lm_lookahead.history_limit = la_opts[
                "history_limit"]
            config.speech_recognizer.recognizer.lm_lookahead.tree_cutoff = la_opts[
                "tree_cutoff"]
            config.speech_recognizer.recognizer.lm_lookahead.minimum_representation = (
                la_opts["minimum_representation"])
            post_config.speech_recognizer.recognizer.lm_lookahead.cache_size_low = (
                la_opts["cache_low"])
            post_config.speech_recognizer.recognizer.lm_lookahead.cache_size_high = (
                la_opts["cache_high"])

        post_config.speech_recognizer.global_cache.read_only = True
        post_config.speech_recognizer.global_cache.file = lm_gc.out_global_cache
        post_config.speech_recognizer.model_combination.lm.image = lm_gc.lm_image

        # Lattice writer options
        config.speech_recognizer.lattice_archive.path = "raw-denominator.$(TASK)"
        post_config.speech_recognizer.lattice_archive.info = True

        config._update(extra_config)
        post_config._update(extra_post_config)

        return config, post_config
Example #23
    def make_tf_feature_flow(self, feature_flow, tf_graph, tf_checkpoint,
                             **kwargs):
        """
        :param feature_flow:
        :param Path tf_graph
        :param Checkpoint tf_checkpoint:
        :param kwargs:
        :return:
        """

        # tf flow (model scoring done in tf flow node) #
        tf_flow = rasr.FlowNetwork()
        tf_flow.add_input("input-features")
        tf_flow.add_output("features")
        tf_flow.add_param("id")

        tf_fwd = tf_flow.add_node("tensorflow-forward", "tf-fwd",
                                  {"id": "$(id)"})
        tf_flow.link("network:input-features", tf_fwd + ":features")
        tf_flow.link(tf_fwd + ":log-posteriors", "network:features")

        tf_flow.config = rasr.RasrConfig()
        tf_flow.config[tf_fwd].input_map.info_0.param_name = "features"
        tf_flow.config[tf_fwd].input_map.info_0.tensor_name = (
            "extern_data/placeholders/data/data")
        tf_flow.config[tf_fwd].input_map.info_0.seq_length_tensor_name = (
            "extern_data/placeholders/data/data_dim0_size")

        tf_flow.config[tf_fwd].output_map.info_0.param_name = "log-posteriors"
        tf_flow.config[tf_fwd].output_map.info_0.tensor_name = kwargs.get(
            "output_tensor_name", "output/output_batch_major")

        from sisyphus.delayed_ops import DelayedFunction
        tf_flow.config[tf_fwd].loader.type = "meta"
        tf_flow.config[tf_fwd].loader.meta_graph_file = tf_graph
        #tf_flow.config[tf_fwd].loader.saved_model_file = tf_checkpoint.get_delayed_checkpoint_path()
        tf_flow.config[tf_fwd].loader.saved_model_file = tf_checkpoint

        # TODO: HACK
        from i6_core.returnn.compile import CompileNativeOpJob

        # DO NOT USE BLAS ON I6, THIS WILL SLOW DOWN RECOGNITION ON OPTERON MACHINES BY FACTOR 4
        native_op = CompileNativeOpJob(
            "NativeLstm2",
            returnn_python_exe=self.recognition_args.compile_exec
            or self.defalt_training_args['returnn_python_exe'],
            returnn_root=self.defalt_training_args['returnn_root'],
            # blas_lib=tk.Path(gs.BLAS_LIB, hash_overwrite="BLAS_LIB")).out_op,
            blas_lib=self.recognition_args.blas_lib,
            search_numpy_blas=False).out_op

        tf_flow.config[tf_fwd].loader.required_libraries = native_op

        # interconnect flows #
        tf_feature_flow = rasr.FlowNetwork()
        base_mapping = tf_feature_flow.add_net(feature_flow)
        tf_mapping = tf_feature_flow.add_net(tf_flow)
        tf_feature_flow.interconnect_inputs(feature_flow, base_mapping)
        tf_feature_flow.interconnect(
            feature_flow,
            base_mapping,
            tf_flow,
            tf_mapping,
            {"features": "input-features"},
        )

        if kwargs.get("append", False):
            concat = tf_feature_flow.add_node(
                "generic-vector-f32-concat",
                "concat",
                attr={"timestamp-port": "features"},
            )
            tf_feature_flow.link(
                tf_mapping[tf_flow.get_output_links("features").pop()],
                concat + ":tf")
            tf_feature_flow.link(
                base_mapping[feature_flow.get_output_links("features").pop()],
                concat + ":features",
            )
            tf_feature_flow.add_output("features")
            tf_feature_flow.link(concat, "network:features")
        else:
            tf_feature_flow.interconnect_outputs(tf_flow, tf_mapping)
        # ensure cache_mode as base feature net
        tf_feature_flow.add_flags(feature_flow.flags)
        return tf_feature_flow
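Finally, a hedged usage sketch for make_tf_feature_flow; the flow registry and compile-job names are assumptions following the conventions of the other examples.

tf_scorer_flow = system.make_tf_feature_flow(
    feature_flow=system.feature_flows["train"]["mfcc"],  # assumed flow registry
    tf_graph=compile_graph_job.out_graph,  # assumed CompileTFGraphJob output
    tf_checkpoint=checkpoint,
    output_tensor_name="output/output_batch_major",
)
# with append=True the TF posteriors would be concatenated to the base features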