Example #1
    def topk_choice(self, word_sequence, topk_wds=None):
        '''
        Extracts the top-k choices of the LM given a word history (lattice).
        Input: the sentence lattice (word_sequence) and the LM (self.lm).
        Output: an FST over the top-k words that complete the lattice.
        '''

        # Generate the sentence FST and intersect it with the LM.
        fstout = fst.intersect(word_sequence, self.lm)
        fst_det = fst.determinize(fstout)
        # Push weights toward the final state so the last arcs carry the
        # accumulated path weights.
        fst_p = fst.push(fst_det, push_weights=True, to_final=True)
        fst_p.rmepsilon()
        fst_rm = fst.determinize(fst_p)
        short = fst.shortestpath(fst_rm, nshortest=10)
        short_det = fst.determinize(short)
        short_det.rmepsilon()
        # Compose with the refiner to collapse the paths into a two-state
        # machine over the candidate words.
        two_state = fst.compose(short_det, self.refiner)
        output = two_state.project(project_output=True)
        output.rmepsilon()
        output = fst.determinize(output)
        output.minimize()
        if topk_wds is not None:  # Needs to distinguish None and []
            topk_wds.extend(self.get_topk_words(output))
        return output
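For context, word_sequence is a linear acceptor over the word history, sharing the LM's symbol table. A minimal sketch of building such a lattice with pywrapfst (the symbol table and the words here are placeholders, not taken from the original project):

import pywrapfst as fst

# Hypothetical symbol table covering the LM vocabulary.
word_syms = fst.SymbolTable()
word_syms.add_symbol("<eps>")
for w in ("the", "quick", "brown"):
    word_syms.add_symbol(w)

# Compile "the quick brown" as a linear, unweighted acceptor.
compiler = fst.Compiler(isymbols=word_syms, osymbols=word_syms,
                        keep_isymbols=True, keep_osymbols=True,
                        acceptor=True)
for i, w in enumerate(("the", "quick", "brown")):
    compiler.write("{} {} {}\n".format(i, i + 1, w))
compiler.write("3\n")
word_sequence = compiler.compile()
word_sequence.arcsort(sort_type="olabel")  # sort arcs before intersection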
Example #2
def alternatives(sequence):
    # sequence is a list of words; this function produces the n_best
    # in-vocabulary (IV) alternatives to sequence by replacing graphemes.
    filename = index_name + '_' + '_'.join(sequence)
    filename = filename.replace("'", "")
    file_path = 'FSTs/compositions/' + filename + '.pd'

    if rerun:
        # Build a linear character-level FST for the sequence, with </w>
        # marking word boundaries.
        compiler_sequence = fst.Compiler(isymbols=printable_ST,
                                         osymbols=printable_ST,
                                         keep_isymbols=True,
                                         keep_osymbols=True)
        c = 0
        for word in sequence:
            for char in word:
                print('{} {} {} {}'.format(c, c + 1, char, char),
                      file=compiler_sequence)
                c += 1
            print('{} {} </w> </w>'.format(c, c + 1), file=compiler_sequence)
            c += 1
        print(c, file=compiler_sequence)
        fst_sequence = compiler_sequence.compile()

        # Composition occurs in the tropical semiring: apply the error maker
        # to the sequence, keep the output side, and intersect with the
        # vocabulary acceptor.
        composition = fst.intersect(
            fst.compose(fst_sequence,
                        error_maker_fst).project(project_output=True),
            fst_vocab).rmepsilon().arcsort()

        alters = printstrings(composition,
                              nshortest=n_best,
                              syms=printable_ST,
                              weight=True,
                              project_output=False)
        scores = []
        if alters:
            # Tropical weights are negative log probabilities.
            scores = [math.exp(-float(alt[1])) for alt in alters]
            alters = [alt[0].split(' </w>')[:-1] for alt in alters]
            alters = [[''.join(word.split(' ')) for word in alt]
                      for alt in alters]
        with open(file_path, 'wb') as f:
            pickle.dump({'alters': alters, 'scores': scores}, f)
    else:
        with open(file_path, 'rb') as f:
            d = pickle.load(f)
        alters = d['alters']
        scores = d['scores']
        alters = alters[0:n_best]
        scores = scores[0:n_best]

    return alters, scores
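The scores computed above rely on tropical weights being negative natural-log probabilities, so math.exp(-w) turns a path weight w back into a probability. A quick round-trip check:

import math

p = 0.25
w = -math.log(p)                      # probability -> tropical weight
assert abs(math.exp(-w) - p) < 1e-12  # weight -> probability, as in alternatives()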
Example #3
 def intersect(self, other):
     """Constructs an unminimized DFA recognizing
     the intersection of the languages of two given DFAs.
     Args:
         other (DFA): The other DFA that will be used
                      for the intersect operation
     Returns:
         DFA: The resulting DFA
     """
     self.automaton = fst.intersect(self.automaton, other.automaton)
     return self
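The DFA wrapper class itself is not shown here; at the pywrapfst level, the same operation on two sketched linear acceptors (the symbols and the helper below are illustrative only) looks like:

import pywrapfst as fst

syms = fst.SymbolTable()
syms.add_symbol("<eps>")
for s in ("a", "b"):
    syms.add_symbol(s)

def linear_acceptor(labels):
    # Compile a linear acceptor for the given label sequence.
    c = fst.Compiler(isymbols=syms, osymbols=syms,
                     keep_isymbols=True, keep_osymbols=True, acceptor=True)
    for i, lab in enumerate(labels):
        c.write("{} {} {}\n".format(i, i + 1, lab))
    c.write("{}\n".format(len(labels)))
    return c.compile()

one = linear_acceptor(["a", "b"]).arcsort(sort_type="olabel")
two = linear_acceptor(["a", "b"])
both = fst.intersect(one, two)  # accepts exactly the string "a b"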
Example #4
    def get_prior(self):
        '''
        Sets up an array of priors.
        In the future, priors will be given by the RSVP EEG vector.

        OUTPUTS:
            an array of tuples, each consisting of a character and its
            corresponding probability.
        '''
        sigma_h = self.create_machine_history()
        print(sigma_h)
        # intersect
        sigma_h.arcsort(sort_type="olabel")
        output_dist = fst.intersect(sigma_h, self.lm)
        print(output_dist)
        # process result
        output_dist = output_dist.rmepsilon()
        output_dist = fst.determinize(output_dist)
        output_dist.minimize()
        output_dist = fst.push(output_dist, push_weights=True, to_final=True)

        # worth converting this history to np.array if vector computations
        # will be involved

        # Traverses the states until we reach the second-to-last one; the
        # arcs from that state to the final state carry the distribution
        # we want.
        prev_stateid = curr_stateid = None
        for state in output_dist.states():
            if curr_stateid is not None:
                prev_stateid = curr_stateid
            curr_stateid = state
        priors = []
        for arc in output_dist.arcs(prev_stateid):
            # ilabel and olabel are the same in this acceptor.
            ch = self.lm_syms.find(arc.ilabel)
            w = float(arc.weight)

            # TODO: for this demo we only need distribution over the characters
            # from 'a' to 'z'
            if len(ch) == 1 and ch in self.legit_ch_dict:
                priors.append((ch, w))

        # assuming the EEG input is an array like [("a", 0.3), ("b", 0.2), ...]
        # sort the array by weight (weights are negative log probabilities)
        priors = sorted(priors, key=lambda prior: prior[1])
        normalized_dist = self._normalize([prob for _, prob in priors])
        return list(zip([ch for ch, _ in priors], normalized_dist))
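_normalize is not shown in this example. Assuming the collected weights are negative log probabilities (the tropical-semiring convention used throughout these examples), a plausible sketch is:

import math

def _normalize(neg_log_weights):
    # Hypothetical sketch: convert negative log weights to probabilities
    # and rescale them to sum to 1.
    probs = [math.exp(-w) for w in neg_log_weights]
    total = sum(probs)
    return [p / total for p in probs]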
Example #5
    def next_char_dist(self, history, char_lm):
        '''
        Gets the distribution of the next character.
        '''
        history = self.concat_alphabet(history)
        history.arcsort(sort_type="olabel")
        output = fst.intersect(history, char_lm)
        output.rmepsilon()
        output = fst.determinize(output)
        output.minimize()

        # Composes with the letter-distribution FST to combine the weights
        # of the next character.
        last_ltr = fst.compose(output, self.ltr_dist)
        last_ltr.project(True)
        last_ltr.push(to_final=True)
        last_ltr.rmepsilon()
        last_ltr = fst.determinize(last_ltr)
        last_ltr.minimize()

        # Extracts priors. Although it's a two-state machine, we keep the
        # generic traversal procedure here just in case.
        prev_stateid = curr_stateid = None
        for state in last_ltr.states():
            if curr_stateid is not None:
                prev_stateid = curr_stateid
            curr_stateid = state
        priors = []
        syms = last_ltr.input_symbols()
        for arc in last_ltr.arcs(prev_stateid):
            ch = syms.find(arc.ilabel)
            w = float(arc.weight)
            if len(ch) == 1:
                priors.append((ch, w))

        # Sorts the priors by weight and normalizes them in log space.
        priors = sorted(priors, key=lambda prior: prior[1])
        priors_vals = [BitWeight(prob) for _, prob in priors]
        # The start value BitWeight(1e6) presumably represents a negligible
        # probability, i.e. the identity element for this log-space sum.
        total = sum(priors_vals, BitWeight(1e6))
        norm_priors = [(prob / total).loge() for prob in priors_vals]
        return list(zip([ch for ch, _ in priors], norm_priors))
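BitWeight comes from the surrounding project and accumulates probabilities in log space to avoid underflow. For reference, the same normalization written with plain floats (the values are made up):

import math

neg_log = [2.1, 1.3, 0.7]               # negative log probabilities
probs = [math.exp(-w) for w in neg_log]
total = sum(probs)
norm = [p / total for p in probs]       # proper distribution summing to 1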
sigfst = fst.Fst.read("sigma.fst")
sigout = fst.Fst.read("sigout.fst")
rhofst = fst.Fst.read("rho.fst")
rhoout = fst.Fst.read("rhoout.fst")
phifst = fst.Fst.read("phi.fst")
phiout = fst.Fst.read("phiout.fst")

# Reserved labels that the specializer rewrites into special matchers:
# sigma matches any symbol, rho matches "anything else" (consuming the
# symbol), and phi is a failure transition (non-consuming).
sigma_label = 5
rho_label = 6
phi_label = 7

rewrite_mode = "always"

phi_self_loop = True

sigfst = specializer.sigma(sigfst, sigma_label, rewrite_mode).get()
rhofst = specializer.rho(rhofst, rho_label, rewrite_mode).get()
phifst = specializer.phi(phifst, phi_label, rewrite_mode, phi_self_loop).get()

print "Orignial Fst"
print one
print "intersection with the sigma machine"
print fst.intersect(sigfst, one)
print fst.equal(fst.intersect(sigfst, one), sigout)
print "intersection with the rho machine"
print fst.intersect(rhofst, one)
print fst.equal(fst.intersect(rhofst, one), rhoout)
print "intersection with the phi machine"
print fst.intersect(phifst, one)
print fst.equal(fst.intersect(phifst, one), phiout)
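A template like sigma.fst could be compiled as a one-state acceptor whose single arc carries the reserved label, here 5 to match sigma_label above. This is a sketch under that assumption, not the original file:

import pywrapfst as fst

compiler = fst.Compiler(acceptor=True)
compiler.write("0 0 5\n")  # reserved label 5: rewritten into a sigma matcher
compiler.write("0\n")      # state 0 is both initial and final
sigma_template = compiler.compile()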