Example no. 1
0
    def generate_inflections(self, q, lemma):
        """Compose query *q* with the lattice and refiner FSTs and return a
        dict mapping inflected forms to normalized probabilities.

        Args:
            q: input FST (query) composed with ``self.lattice``.
            lemma: sequence of symbols; its prefix is kept and its suffix is
                rewritten according to each arc label ``<delete>_<insert>``.

        Returns:
            dict mapping inflection string -> normalized probability.
        """
        paths = fst.compose(q, self.lattice)
        two_state = fst.compose(paths, self.refiner)
        output = two_state.project(project_output=True)
        output.rmepsilon()
        output = fst.determinize(output)
        output.minimize()

        # Read the fst out: each arc leaving the start state carries a label
        # of the form "<delete>_<insert>" and a weight in -log space.
        dist = []
        labels = []
        state = output.start()
        for arc in output.arcs(state):
            labels.append(self.input_syms.find(arc.ilabel))
            # Weights are negative log-probabilities; convert back to probs.
            dist.append(math.e ** (-float(arc.weight)))

        sum_value = sum(dist)
        norm_dist = [prob / sum_value for prob in dist]

        inf_d = {}
        # Loop variable renamed from `dist` to `prob`: the original shadowed
        # the `dist` list above.
        for label, prob in zip(labels, norm_dist):
            delete, insert = label.split("_")
            cut = len(delete)
            # Guard cut == 0: `lemma[:-0]` is empty, not the whole lemma.
            stem = "".join(lemma[:-cut]) if cut else "".join(lemma)
            inf_d[stem + insert] = prob
        return inf_d
Example no. 2
0
 def generate_inflections(self, q, lemma):
     """Compose *q* with the lattice and refiner FSTs and return a string
     of (inflection, probability) pairs sorted by ascending probability."""
     composed = fst.compose(q, self.lattice)
     refined = fst.compose(composed, self.refiner)
     out = refined.project(project_output=True)
     out.rmepsilon()
     out = fst.determinize(out)
     out.minimize()

     # Collect arc labels and raw weights leaving the start state.
     arcs = list(out.arcs(out.start()))
     labels = [self.input_syms.find(a.ilabel) for a in arcs]
     dist = [float(a.weight) for a in arcs]

     # Normalize the raw weights into a distribution.
     total = sum(dist)
     norm_dist = [w / total for w in dist]

     # Each label has the form "<delete>_<insert>": strip the deleted
     # suffix from the lemma and append the inserted one.
     relabels = []
     for label in labels:
         delete, insert = label.split("_")
         relabels.append(lemma[:-len(delete)] + insert)

     pairs = sorted(zip(relabels, norm_dist), key=lambda pair: pair[1])
     return str(pairs)
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname):
    """Build a lattice FST that maps lemmas and constraints (or priors) to
    their inflected versions, and write it to disk.

    Args:
        dev_fname: whitespace-separated dev file whose last three columns
            are (constraints, lemma, inflection).
        isyms_fname: path to the input symbol table (text format).
        constraints: constraint string; '_' separators are rewritten to ';'
            to match the dev-file convention.
        lattice_output: path the resulting FST is written to.
        refiner_fname: unused in this body; kept for interface compatibility.
    """
    # rewrite constraints to the dev-file separator convention
    constraints = constraints.replace("_", ";")

    # read isyms and build a symbol -> id map
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst()
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # State that receives the <sigma> self-loops below; stays None when no
    # dev line matches the requested constraints (the original raised an
    # unbound-name NameError in that case).
    keep = None
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            cns, lemma, inflection = line.split()[-3:]
            if cns != constraints:
                continue
            print(cns, lemma, inflection)
            # find idx where the strings first diverge
            idx = 0
            for i, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = i
                    break
            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            # linear path spelling out the shared lemma characters
            for j in range(idx, len(lemma)):
                new = f.add_state()
                f.add_arc(old, fst.Arc(code[lemma[j]], code[lemma[j]],
                                       fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly) via a composite "<delete>_<insert>" symbol
            sym = lemma[idx:] + "_" + inflection[idx:]
            print(lemma, inflection, sym)
            f.add_arc(old, fst.Arc(code[sym], code[s_fin],
                                   fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> self-loops in the <sigma> placeholder state
    if keep is not None:
        for c, ltr in code.items():
            # (hard-coded) symbol ids of Russian letters + 2 more
            if 1 < int(ltr) < 36:
                f_big.add_arc(keep,
                              fst.Arc(code[c], code[c],
                                      fst.Weight(f_big.weight_type(), 1.0),
                                      keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)
Example no. 4
0
    def from_vocab(cls, vocab, tokenizer):
        """Build a deterministic, minimized lexicon FST from *vocab*.

        Each word is tokenized to character indices (plus a trailing space
        token) and added as a linear path from the start state; the result
        is then epsilon-removed, determinized, and minimized.

        Args:
            vocab: iterable of words.
            tokenizer: provides ``token2idx(word)`` and ``space_idx``.

        Returns:
            a ``cls`` instance wrapping the built FST.
        """
        # Renamed from `fst` to avoid shadowing the conventional module name.
        lexicon = openfst.Fst()

        def add_word(word):
            # Trailing space token marks the word boundary.
            i_words = tokenizer.token2idx(word) + [tokenizer.space_idx]
            if not lexicon.num_states():
                initial_state = lexicon.add_state()
                assert initial_state == 0
                lexicon.set_start(initial_state)

            source_state = lexicon.start()
            dest_state = None
            for i in i_words:
                # The initial state of FST is state 0, hence the index of
                # chars in the FST should start from 1 to avoid the conflict
                # with the initial state, otherwise wrong decoding results
                # would be given.
                i += 1
                dest_state = lexicon.add_state()
                lexicon.add_arc(source_state, openfst.Arc(i, i, 0, dest_state))
                source_state = dest_state

            lexicon.set_final(dest_state, openfst.Weight.One('tropical'))

        # (The original also kept an unused `lexicon_size` counter; dropped.)
        for word in vocab:
            add_word(word)

        # This gets rid of "epsilon" transitions in the FST.
        # These are transitions that don't require a string input to be taken.
        # Getting rid of them is necessary to make the FST deterministic, but
        # can greatly increase the size of the FST.
        lexicon.rmepsilon()

        # This makes the FST deterministic, meaning for any string input
        # there's only one possible state the FST could be in.  It is assumed
        # our dictionary is deterministic when using it.
        # (lest we'd have to check for multiple transitions at each state)
        lexicon = openfst.determinize(lexicon)

        # Finds the simplest equivalent fst. This is unnecessary but
        # decreases memory usage of the dictionary.
        lexicon.minimize()

        return cls(fst_path=None, fst=lexicon)
Example no. 5
0
    def make_graph(self,
                   lexicon_file,
                   graphemes_file,
                   grammar_file,
                   sil_symbol,
                   disambig_graphemes_file='graphemes_disambig.txt',
                   words_file='words.txt',
                   graph_file='LG.fst'):
        """Build the decode graph and write it to disk.

        Args:
            lexicon_file: lexicon file from word to graphemes sequence
            graphemes_file: graphemes id file
            grammar_file: arpa language model file
            sil_symbol: silence symbol ("SIL" for phones/syllables,
                <space> for characters)
            disambig_graphemes_file: output graphemes table from grapheme to
                id; isymbols_table for result WFST
            words_file: output words table from word to id;
                osymbols_table for result WFST
            graph_file: write result graph to graph_file, default: LG.fst

        Returns:
            the composed (and arc-sorted) decode graph.

        Raises:
            ValueError: if ``self.graph_type`` is neither 'TLG' nor 'LG'.
        """

        if self.graph_type == 'TLG':
            L = self.lexicon_builder(lexicon_file, graphemes_file, sil_symbol)
            logging.info('grapheme should be  phones or syllables')
            logging.info(
                'please confirm the sil symbol is "SIL" or some other you defined'
            )
        elif self.graph_type == 'LG':
            L = self.lexicon_builder(lexicon_file, graphemes_file, sil_symbol)
            logging.info('grapheme should be characters')
            logging.info(
                'please confirm the sil symbol is <space> or some other you defined'
            )
        else:
            # The original fell through with L unbound, producing a confusing
            # NameError at the compose step below.
            raise ValueError('unknown graph_type: {}'.format(self.graph_type))
        self.lexicon_builder.write_words_table(words_file)
        self.lexicon_builder.write_disambig_graphemes_table(
            disambig_graphemes_file)
        logging.info('make lexicon fst successfully')

        G = self.grammar_builder(grammar_file, words_file)
        logging.info('make grammar fst successfully')

        LG = fst.compose(L, G)
        logging.info('LG compose successfully')
        LG = fst.determinize(LG)
        logging.info('LG determinize successfully')
        LG.minimize()
        logging.info('LG minimize successfully')
        # sort arcs by input label so later composition is valid
        LG.arcsort(sort_type='ilabel')

        if self.graph_type == 'TLG':
            T = self.token_builder(disambig_graphemes_file)  # specify blank
            self.graph = fst.compose(T, LG)
            logging.info('TLG compose successfully')
            remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
            logging.info('TLG fst remove unk successfully')

        elif self.graph_type == 'LG':
            self.graph = LG
            remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
            logging.info('LG fst remove unk successfully')
            remove_disambig_symbol(self.graph,
                                   self.lexicon_builder.disambig_ids)
            logging.info('LG fst remove disambig  successfully')

        self.graph.arcsort(sort_type='ilabel')
        self.graph.write(graph_file)
        logging.info('write graph successfully')
        return self.graph
Example no. 6
0
def build_lm(dev_fname, isyms_fname, constraints, lattice_output):
    """Build a log-semiring lattice FST that maps lemmas and constraints
    (or priors) to their inflected versions, and write it to disk.

    Args:
        dev_fname: tab-separated dev file whose leading columns are
            (lemma, inflection, constraints); lemma and inflection are
            space-separated symbol sequences.
        isyms_fname: path to the input symbol table (text format).
        constraints: constraint string; '_' separators are rewritten to ';'.
        lattice_output: path the resulting FST is written to.
    """
    # rewrite constraints to the dev-file separator convention
    constraints = constraints.replace("_", ";")

    # read isyms and build a symbol -> id map
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice in the log semiring
    f_big = fst.Fst("log")
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # All possible inflections are added, regardless of the prior (applying
    # the prior would make for a more efficient computation).
    # `keep` stays None when no dev line matches the requested constraints
    # (the original raised an unbound-name NameError in that case).
    keep = None
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            line = line.strip()
            lemma, inflection, cns = line.split("\t")[:-2]
            if cns != constraints:
                continue

            # find index where the symbol sequences first diverge
            idx = 0
            lemma = lemma.split()
            inflection = inflection.split()
            for j, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = j
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            # linear path spelling out the shared lemma symbols
            for j in range(idx, len(lemma)):
                new = f.add_state()
                f.add_arc(old,
                          fst.Arc(code[lemma[j]], code[lemma[j]],
                                  fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly) via a composite "<delete>_<insert>" symbol
            sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:])
            f.add_arc(old,
                      fst.Arc(code[sym], code[s_fin],
                              fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> self-loops in the <sigma> placeholder state
    if keep is not None:
        for c, ltr in code.items():
            # (hard-coded) symbol ids of Russian letters + 2 more
            if 1 < int(ltr) < 51:
                f_big.add_arc(keep,
                              fst.Arc(code[c], code[c],
                                      fst.Weight(f_big.weight_type(), 1.0),
                                      keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)