Example #1
    def add_start_and_end_of_sentence_symbols(fst_1):
        """
        Concatenate start-of-sentence and end-of-sentence symbols to the FST.
        :param fst_1: FST object
        :return: FST with prepended start of sentence symbol and appended end of sentence symbol.
        """

        # Create start of sentence FSA
        # 1 is start of sentence label
        start_of_sentence = fst.Transducer()
        start_of_sentence.add_arc(0, 1, 0, 1)
        start_of_sentence[1].final = True

        # Create end of sentence FSA
        # 2 is end of sentence label
        end_of_sentence = fst.Transducer()
        end_of_sentence.add_arc(0, 1, 0, 2)
        end_of_sentence[1].final = True

        # Modify start_of_sentence by concatenating fst_1
        start_of_sentence.concatenate(fst_1)

        # Modify joint start_of_sentence and fst_1 by concatenating end_of_sentence
        start_of_sentence.concatenate(end_of_sentence)

        return start_of_sentence
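A minimal usage sketch for the helper above. The two-token input FST and its integer labels (3 and 4) are made up for illustration; like the helper itself, it relies on integer label ids rather than symbol-table strings.

# Hypothetical input: a linear FST over two made-up token ids (3 and 4).
token_fst = fst.Transducer()
token_fst.add_arc(0, 1, 3, 3)
token_fst.add_arc(1, 2, 4, 4)
token_fst[2].final = True

# Wrap it with the start (label 1) and end (label 2) of sentence symbols.
wrapped_fst = add_start_and_end_of_sentence_symbols(token_fst)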
Example #2
def Transducer(isyms=None, osyms=None, semiring=semiring):
    global syms
    if isyms is None:
        isyms = syms
    if osyms is None:
        osyms = syms
    return fst.Transducer(isyms=isyms, osyms=osyms, semiring=semiring)
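A short sketch of why such a wrapper helps: every transducer built through it defaults to the shared module-level symbol table (assumed here to be syms = fst.SymbolTable(), with a semiring default also defined at import time), so the results can be combined without relabeling.

# Assumes the enclosing module defines `syms` (a shared fst.SymbolTable) and
# `semiring`, which the wrapper above picks up as defaults.
t1 = Transducer()
t1.add_arc(0, 1, 'a', 'b')
t1[1].final = True

t2 = Transducer()
t2.add_arc(0, 1, 'b', 'c')
t2[1].final = True

# Both share one symbol table, so composition works directly.
composed = t1 >> t2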
Example #3
def create_wordlist_fst(words):
    "This takes a list of words and creates a letter-to-word transducer for all of the words (unioned together)."
    wordset = fst.Transducer()
    for word in words:
        wordfst = gen_word_fst(word, isyms=wordset.isyms, osyms=wordset.osyms)
        wordset = wordset | wordfst
    return wordset
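A hypothetical call, just to show the shape of the result (the word list is made up, and gen_word_fst from Example #9 is assumed to be in scope):

wordlist = create_wordlist_fst(["cat", "car", "cart"])
# One letters-to-word branch per word, unioned into a single transducer;
# len() gives its state count, as in the test example further below.
num_states = len(wordlist)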
Example #4
    def get_replace_transducer(self):
        transducer_symbol_table = SegmentTable().transducer_symbol_table
        inner_replace_transducer = fst.Transducer(
            isyms=transducer_symbol_table, osyms=transducer_symbol_table)
        for segment1, segment2 in self.target_change_tuples_list:
            inner_replace_transducer.add_arc(0, 1, segment1, segment2)
        inner_replace_transducer[1].final = True
        inner_replace_transducer_ignore_brackets = [
            LEFT_CENTER_BRACKET, RIGHT_CENTER_BRACKET
        ]

        for bracket in inner_replace_transducer_ignore_brackets:
            inner_replace_transducer.add_arc(0, 0, bracket, bracket)
            inner_replace_transducer.add_arc(1, 1, bracket, bracket)

        opt_part = left_bracket_transducer + inner_replace_transducer + right_bracket_transducer
        add_opt(opt_part)

        sigma_star_regex = "({})*".format("+".join(self.alphabet))
        sigma_star_dfa = get_dfa_from_regex(sigma_star_regex,
                                            sigma=self.alphabet)
        sigma_star_dfa_ignore_identity = get_ignore_dfa(
            self.alphabet
            | set([LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET]),
            sigma_star_dfa,
            set([LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET]))
        id_sigma_star = pyfst_from_dfa(sigma_star_dfa_ignore_identity)

        concat_transducer = id_sigma_star + opt_part
        replace_transducer = concat_transducer.closure()
        # dot(replace_transducer, "replace_transducer")
        return replace_transducer
Example #5
def fstbuild(words):
    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    def bs(s):
        letter_syms = fst.read_symbols("ascii.syms.bin")
        return letter_syms[s]

    biggest = 0

    for w in words:
        p = 0
        c = 0
        trie.add_arc(p, biggest + 1, w[c], "<epsilon>", 0)
        p = biggest + 1
        c += 1
        while (c < len(w) - 1):
            trie.add_arc(p, p + 1, w[c], "<epsilon>", 0)
            p += 1
            c += 1
        trie.add_arc(p, p + 1, w[c], w, 0)
        p += 1
        biggest = max(p, biggest)
        last_state = trie[biggest]
        last_state.final = True

    det_trie = trie.determinize()
    det_trie.arc_sort_input()
    det_trie.remove_epsilon()

    return det_trie
Example #6
    def create_rule_fst(self, rule, feature_weights_dict):
        """
        Create rule FST accepting the sequence of target side tokens.
        :param rule: Rule object
        :param feature_weights_dict: Dictionary of feature names and their weights
        :return: Rule FST
        """

        # Determine whether to use word insertion penalty
        if 'word_insertion_penalty' in feature_weights_dict and not rule.hiero_intersection_rule:
            wip = feature_weights_dict['word_insertion_penalty']
        else:
            wip = None

        # Add arcs representing target tokens one after the other
        rule_fst = fst.Transducer()

        index = -1
        for index, token in enumerate(rule.target_side):
            self.add_arc(rule_fst,
                         index,
                         token,
                         rule.nonterminal_coverages,
                         weight=wip)

        # Compute rule weight in a log linear model
        rule_weight = helpers.loglinear_rule_weight(rule.feature_dict,
                                                    feature_weights_dict)

        # Add the rule weight to the final state in the FST
        rule_fst[index + 1].final = rule_weight

        return rule_fst
Example #7
def get_transducer_acceptor(string_):
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=transducer_symbol_table,
                                osyms=transducer_symbol_table)
    for i, char in enumerate(string_):
        transducer.add_arc(i, i + 1, char, char)
    transducer[i + 1].final = True
    return transducer
Example #8
def levenshtein(w, editdst):

    wts = keyweights()

    trie = fst.Transducer()

    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms
    letttup = list(letter_syms.items())
    letters = list()
    for let in letttup:
        letters.append(let[0])

    class StateCounter(object):
        def __init__(self):
            self.set = {}
            self.count = -1

        def __contains__(self, obj):
            return obj in self.set

        def __getitem__(self, obj):
            if not obj in self.set:
                self.count += 1
                self.set[obj] = self.count
            return self.set[obj]

    states = StateCounter()

    for x in range(0, len(w)):
        for y in range(0, editdst + 1):
            trie.add_arc(states[str(x) + "^" + str(y)],
                         states[str(x + 1) + "^" + str(y)], w[x], w[x],
                         0)  # char in word
            if not y == editdst:
                trie.add_arc(states[str(x) + "^" + str(y)],
                             states[str(x + 1) + "^" + str(y + 1)],
                             "<epsilon>", "<epsilon>", 1.5)  # deletion
                for i in letters:
                    trie.add_arc(states[str(x) + "^" + str(y)],
                                 states[str(x + 1) + "^" + str(y + 1)], i, i,
                                 wts[w[x], i])  # substitution
                    trie.add_arc(states[str(x) + "^" + str(y)],
                                 states[str(x) + "^" + str(y + 1)], i, i,
                                 wts[w[x], i])  # insertion

    for y in range(0, editdst + 1):
        trie[states[str(len(w)) + "^" + str(y)]].final = True

    trie.remove_epsilon()
    trie.arc_sort_input()

    return trie
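A hedged end-to-end sketch that combines this with fstbuild from Example #5: it assumes the same ascii.syms.bin symbol table file and a keyweights() weight table are available, so both FSTs share compatible letter symbols.

lexicon = fstbuild(["hello", "help", "hollow"])   # made-up word list
edit_lattice = levenshtein("hallo", 2)            # strings within 2 edits of "hallo"

# Compose the edit lattice with the lexicon and keep the cheapest correction.
candidates = edit_lattice >> lexicon
best = candidates.shortest_path()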
Example #9
def gen_word_fst(word, isyms=None, osyms=None):
    "This takes a word and creates a transducer from the letters to the word, introducing symbols into the symbol table as needed"
    wordfst = fst.Transducer(isyms, osyms)
    state = 0
    for char in word:
        wordfst.add_arc(state, state + 1, char, 'ε')
        state = state + 1
    wordfst[state].final = True
    for arc in wordfst[state - 1].arcs:
        arc.olabel = wordfst.osyms[word]
    return wordfst
Example #10
def get_prologue_inverse_transducer():
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    prologue_inverse_transducer = fst.Transducer(isyms=transducer_symbol_table,
                                                 osyms=transducer_symbol_table)
    alphabet = set(SegmentTable().get_segments_symbols())
    for segment in alphabet:
        prologue_inverse_transducer.add_arc(0, 0, segment, segment)
    for bracket in BRACKETS:
        prologue_inverse_transducer.add_arc(0, 0, bracket, EPSILON)
    prologue_inverse_transducer[0].final = True
    return prologue_inverse_transducer
Example #11
def create_ipa_fst():
    '''Creates an FST for converting CALLHOME dictionary pronunciations to Arabic.'''
    ipa_fst = fst.Transducer()
    fst_file = codecs.open(CALLHOME_FST, 'r', encoding='utf-8')
    for l in fst_file:
        l = l.replace(u'\ufeff', '')
        rule = l.split()
        if len(rule) == 4:
            ipa_fst.add_arc(int(rule[0]), int(rule[1]), rule[2], rule[3])
    ipa_fst[1].final = True
    fst_file.close()
    return ipa_fst
Example #12
def get_intro_transducer(sigma, introduced_set):
    sigma_transducer = get_sigma_transducer_for_intro(sigma)

    transducer_symbol_table = SegmentTable().transducer_symbol_table
    cartesian_transducer = fst.Transducer(isyms=transducer_symbol_table,
                                          osyms=transducer_symbol_table)
    for introduced_symbol in introduced_set:
        cartesian_transducer.add_arc(0, 0, EPSILON, introduced_symbol)
    cartesian_transducer[0].final = True
    union_transducer = sigma_transducer | cartesian_transducer
    intro_transducer = union_transducer.closure()
    return intro_transducer
Example #13
def genBigGraph(label_prob, symbols, seq_len, label='x'):
    t = fst.Transducer()
    sym = fst.SymbolTable()

    symbols = map(str, symbols)
    x = 0
    for j in range(seq_len):
        for i in range(len(symbols)):
            prob = label_prob[j][i]  #"%.4f" %
            t.add_arc(0 + x, 1 + x, str(label + str(j)), symbols[i],
                      -math.log(prob))
        x += 1
    t[j + 1].final = -1
    return t
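A small hypothetical call: two time steps over the symbols 'a' and 'b' with made-up posteriors yield a left-to-right lattice whose arc weights are negative log probabilities.

label_prob = [[0.7, 0.3],   # step 0: P(a), P(b)
              [0.4, 0.6]]   # step 1: P(a), P(b)
lattice = genBigGraph(label_prob, ['a', 'b'], seq_len=2, label='x')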
Example #14
def test_simple():
    t = fst.Transducer()
    for i, (ic, oc) in enumerate(zip('hello', 'olleh')):
        t.add_arc(i, i + 1, ic, oc)
    t[i + 1].final = True
    eq_(len(t), 6)
    ok_(t[5].final)

    a = fst.Acceptor()
    for i, c in enumerate('hello'):
        a.add_arc(i, i + 1, c)
    a[i + 1].final = True
    eq_(len(a), 6)
    ok_(a[5].final)
Example #15
def create_dt_fst():
    dt_fst = fst.Transducer(isyms=fst.SymbolTable('eps'),
                            osyms=fst.SymbolTable('eps'))
    fst_file = codecs.open(DUTCH_FST_FILE, 'r', encoding='utf-8')

    for l in fst_file:
        l = l.replace(u'\ufeff', '')
        entry = l.split()
        if len(entry) == 4:
            if entry[3] == 'ks':
                entry[3] = 'k s'
            dt_fst.add_arc(int(entry[0]), int(entry[1]), entry[2], entry[3])

    fst_file.close()
    dt_fst[1].final = True
    dt_fst[2].final = True
    return dt_fst
Example #16
    def create_rule_fst(self, rule, feature_weights_dict):
        """
        Create rule FST accepting the sequence of target side tokens.
        :param rule: Rule object
        :param feature_weights_dict: Dictionary of feature names and their weights
        :return: Rule FST
        """

        # Determine whether to use word insertion penalty
        if 'word_insertion_penalty' in feature_weights_dict and not rule.hiero_intersection_rule:
            wip = feature_weights_dict['word_insertion_penalty']
        else:
            wip = None

        # Offset rule_id by rule_id_offset to prevent clashes with Hiero rule id space
        rule_id = rule.id
        if not rule.hiero_intersection_rule:
            rule_id += self.rule_id_offset

        rule_fst = fst.Transducer()

        # Insert rule arc at the start of the transducer (rule_id:epsilon)
        rule_fst.add_arc(0, 1, int(rule_id), 0)

        # Add arcs representing target tokens one after the other
        # Index is adjusted to account for rule arc
        index = 1
        for index, token in enumerate(rule.target_side, 1):
            self.add_arc(rule_fst,
                         index,
                         token,
                         rule.nonterminal_coverages,
                         weight=wip)

        # Compute rule weight in a log linear model
        rule_weight = helpers.loglinear_rule_weight(rule.feature_dict,
                                                    feature_weights_dict)

        # Add the rule weight to the final state in the FST
        rule_fst[index + 1].final = rule_weight

        if rule.hiero_intersection_rule:
            print rule_weight
            print self.fst_tostring(rule_fst)

        return rule_fst
Example #17
def pyfst_from_dfa(dfa):
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=transducer_symbol_table, osyms=transducer_symbol_table)

    dfa_state_transducer_state_dict = {i: i for i, dfa_state in enumerate(dfa.States)}
    for dfa_state1 in dfa.delta:
        for segment in dfa.delta[dfa_state1]:
            dfa_state2 = dfa.delta[dfa_state1][segment]
            transducer_state1 = dfa_state_transducer_state_dict[dfa_state1]
            transducer_state2 = dfa_state_transducer_state_dict[dfa_state2]
            transducer.add_arc(transducer_state1, transducer_state2, segment, segment)


    for dfa_final_state in dfa.Final:
        transducer_final_state = dfa_state_transducer_state_dict[dfa_final_state]
        transducer[transducer_final_state].final = True

    transducer_initial_state = dfa_state_transducer_state_dict[dfa.Initial]
    transducer[transducer_initial_state].initial = True
    return transducer
Example #18
def make_edit(sigma):
    """
    Make an edit distance transducer with operations:
    - deletion:     x:<epsilon>/1
    - insertion:    <epsilon>:x/1
    - substitution: x:x/0 and x:y/1
    """
    # Create common symbol table
    syms = fst.SymbolTable()

    # Create transducer
    edit = fst.Transducer(syms, syms)
    edit[0].final = True
    for x in sigma:
        edit.add_arc(0, 0, x, fst.EPSILON, 1)
        edit.add_arc(0, 0, fst.EPSILON, x, 1)
        for y in sigma:
            edit.add_arc(0, 0, x, y, (0 if x == y else 1))

    # Define edit distance
    def distance(a, b):
        # Compose a o edit transducer o b
        composed = fst.linear_chain(a, syms) >> edit >> fst.linear_chain(
            b, syms)
        # Compute distance
        distances = composed.shortest_distance(reverse=True)
        dist = int(distances[0])
        # Find best alignment
        alignment = composed.shortest_path()
        # Re-order states
        alignment.top_sort()
        # Replace <epsilon> -> "-"
        alignment.relabel({fst.EPSILON: '-'}, {fst.EPSILON: '-'})
        # Read alignment on the arcs of the transducer
        arcs = (next(state.arcs) for state in alignment)
        labels = ((arc.ilabel, arc.olabel) for arc in arcs)
        align = [(alignment.isyms.find(x), alignment.osyms.find(y))
                 for x, y in labels]
        return dist, align

    return distance
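A quick usage sketch (alphabet and strings are made up):

distance = make_edit(set("helo"))
dist, alignment = distance("hello", "hell")
# dist should come out as 1 (a single deletion); alignment pairs input and
# output symbols, with "-" marking insertions and deletions.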
Example #19
    def create_root_fst(label, int_coverage_cells):
        """
        Create a root FST consisting of a single (nonterminal) transition
        :param label: Nonterminal transition label
        :param int_coverage_cells: Dictionary of integer coverages and associated FSTs
        :return: Root FST
        """

        root_fst = fst.Transducer(isyms=fst.SymbolTable(),
                                  osyms=fst.SymbolTable())
        root_fst.osyms[label] = int(label)

        # Adding epsilon input label using symbol table lookup for id=0
        root_fst.add_arc(0, 1, root_fst.isyms.find(0), label)
        root_fst[1].final = True

        # Create root FST symbol table
        for int_coverage, cell_fst in int_coverage_cells.items():
            root_fst.osyms[int_coverage] = int(int_coverage)

        return root_fst
Example #20
def gen_utt_graph(labels, symdict):
    t2 = fst.Transducer()
    sym = fst.SymbolTable()
    #3x3 states for this example
    count = 0
    x = 0
    # print labels
    for l in labels:
        symbols = symdict[l]
        symbols = map(str, symbols)
        for i in range(len(symbols)):
            if i == 0:
                t2.add_arc(0 + x, 1 + x, symbols[i],
                           str(l + "/" + "(" + symbols[i] + ")"))
            else:
                t2.add_arc(0 + x, 1 + x, symbols[i],
                           str(sym.find(0) + "(" + symbols[i] + ")"))
            t2.add_arc(1 + x, 1 + x, symbols[i],
                       str(sym.find(0) + "(" + symbols[i] + ")"))

            x += 1

    t2[x].final = True
    return t2
Example #21
            B_full_table[dict_tags[tag]][dict_words[word]] = float(
                full_cfd_word_tag[tag][word]) / float(full_num)

full_tag_set.remove('<s>')
full_tag_set.remove('</s>')
full_word_set.remove('<s>')
full_word_set.remove('</s>')

# build the HMM_tagger

import fst
import math

eps = 'ε'

HMM_tagger = fst.Transducer()

num_temp = num_tags - 2

for tag in full_tag_set:
    HMM_tagger.add_arc(0, dict_tags[tag], eps, eps,
                       -math.log(A_full_table[0][dict_tags[tag]]))

for tag in full_tag_set:
    i = dict_tags[tag]
    for word in full_word_set:
        HMM_tagger.add_arc(i, num_temp + i, word, tag,
                           -math.log(B_full_table[i][dict_words[word]]))

for tag1 in full_tag_set:
    i = dict_tags[tag1]
Example #22
if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option("--verbose",
                      action="store_const",
                      const=1,
                      dest="verbose",
                      help="verbose mode")
    (options, args) = parser.parse_args()

    if options.verbose == 1: VERBOSE = 1

    startTime = time.time()

    t1 = fst.Transducer()
    t1.add_arc(0, 1, 'a', 'a')
    t1.add_arc(1, 2, 'b', 'b')
    t1.add_arc(2, 2, 'c', 'c')
    t1[2].final = True

    t2 = fst.Transducer()
    t2.add_arc(0, 1, 'c', 'c')
    t2.add_arc(1, 2, 'b', 'b')
    t2.add_arc(2, 3, 'a', 'a')
    t2[3].final = True
    '''
	while 1 :
		try : line = sys.stdin.readline()
		except KeyboardInterrupt : break
		if not line : break
Example #23
    def create_empty_fst():
        empty_fst = fst.Transducer()
        empty_fst.add_arc(0, 1, 0, 0)
        empty_fst[1].final = True

        return empty_fst
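A brief hypothetical use of the empty FST above (assuming it can be called directly, e.g. as a static method): its single epsilon arc makes it a neutral placeholder that can be concatenated onto any other FST.

placeholder = create_empty_fst()

# Made-up FST with one integer-labelled arc, mirroring the style of Example #1.
other_fst = fst.Transducer()
other_fst.add_arc(0, 1, 3, 3)
other_fst[1].final = True

# Concatenation leaves the accepted relation unchanged up to epsilon removal.
placeholder.concatenate(other_fst)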