def topk_choice(self, word_sequence, topk_wds=None):
    '''
    Extracts the top-k choices of the LM given a word history (lattice).
    input: lm.fst and sentence string
    output: top-k words to complete the lattice
    '''
    # generate sentence fst
    fstout = fst.intersect(word_sequence, self.lm)
    fst_det = fst.determinize(fstout)
    fst_p = fst.push(fst_det, push_weights=True, to_final=True)
    fst_p.rmepsilon()
    fst_rm = fst.determinize(fst_p)
    short = fst.shortestpath(fst_rm, nshortest=10)
    short_det = fst.determinize(short)
    short_det.rmepsilon()
    two_state = fst.compose(short_det, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()
    if topk_wds is not None:  # needs to distinguish None and []
        topk_wds.extend(self.get_topk_words(output))
    return output
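# --- Hedged sketch (not from the original source) ---------------------------
# A self-contained toy version of the intersect -> shortestpath pipeline
# above, assuming the OpenFst Python extension (pywrapfst). The symbol
# table, toy LM costs, and the `paths` helper are invented for
# illustration; `self.lm` / `self.refiner` in the method stand in for
# real models built elsewhere.
import pywrapfst as fst

syms = fst.SymbolTable()
syms.add_symbol("<eps>")  # label 0 is reserved for epsilon
for w in ("the", "cat", "dog"):
    syms.add_symbol(w)

def linear_path(f, start, words, cost):
    """Adds a weighted linear path for `words` to acceptor `f`."""
    cur = start
    for i, w in enumerate(words):
        nxt = f.add_state()
        arc_cost = cost if i == 0 else 0.0
        f.add_arc(cur, fst.Arc(syms.find(w), syms.find(w),
                               fst.Weight(f.weight_type(), arc_cost), nxt))
        cur = nxt
    f.set_final(cur)

# Toy "LM": prefers "the cat" (cost 0.5) over "the dog" (cost 1.5).
lm = fst.Fst()
root = lm.add_state()
lm.set_start(root)
linear_path(lm, root, ["the", "cat"], 0.5)
linear_path(lm, root, ["the", "dog"], 1.5)
lm.arcsort(sort_type="olabel")

# Unweighted candidate lattice over the same two sentences.
lattice = fst.Fst()
root = lattice.add_state()
lattice.set_start(root)
linear_path(lattice, root, ["the", "cat"], 0.0)
linear_path(lattice, root, ["the", "dog"], 0.0)
lattice.arcsort(sort_type="olabel")

scored = fst.intersect(lattice, lm)
best = fst.shortestpath(scored, nshortest=2).rmepsilon()

def paths(f, state=None, prefix=()):
    """Enumerates label sequences; fine for small shortest-path trees."""
    if state is None:
        state = f.start()
    if f.final(state) != fst.Weight.zero(f.weight_type()):
        yield prefix
    for arc in f.arcs(state):
        yield from paths(f, arc.nextstate, prefix + (syms.find(arc.ilabel),))

for p in paths(best):
    print(" ".join(p))  # prints both sentences; "the cat" carries the lower cost
# -----------------------------------------------------------------------------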
def alternatives(sequence):
    # `sequence` is a list of words. Produces the n_best in-vocabulary (IV)
    # alternatives to `sequence` by replacing graphemes.
    filename = index_name + '_' + '_'.join(sequence)
    filename = filename.replace("'", "")
    file_path = 'FSTs/compositions/' + filename + '.pd'
    if rerun:
        # Build a linear character FST for the sequence.
        compiler_sequence = fst.Compiler(isymbols=printable_ST,
                                         osymbols=printable_ST,
                                         keep_isymbols=True,
                                         keep_osymbols=True)
        c = 0
        for word in sequence:
            for char in word:
                print('{} {} {} {}'.format(c, c + 1, char, char),
                      file=compiler_sequence)
                c += 1
            print('{} {} </w> </w>'.format(c, c + 1), file=compiler_sequence)
            c += 1
        print(str(c), file=compiler_sequence)  # marks the final state
        fst_sequence = compiler_sequence.compile()
        # Composition occurs in the tropical semiring: apply the error model
        # to the sequence, project onto the output side, then intersect with
        # the vocabulary acceptor.
        composition = fst.intersect(
            fst.compose(fst_sequence, error_maker_fst).project(project_output=True),
            fst_vocab).rmepsilon().arcsort()
        alters = printstrings(composition, nshortest=n_best, syms=printable_ST,
                              weight=True, project_output=False)
        scores = []
        if alters:
            scores = [math.exp(-float(alt[1])) for alt in alters]
            alters = [alt[0].split(' </w>')[:-1] for alt in alters]
            alters = [[''.join(word.split(' ')) for word in alt]
                      for alt in alters]
        pickle.dump({'alters': alters, 'scores': scores},
                    open(file_path, 'wb'))
    else:
        d = pickle.load(open(file_path, 'rb'))
        alters = d['alters']
        scores = d['scores']
    alters = alters[0:n_best]
    scores = scores[0:n_best]
    return alters, scores
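# --- Hedged sketch (not from the original source) ---------------------------
# The same linear character acceptor built with fst.Compiler under Python 3
# (print(..., file=compiler) replaces the Python 2 `print >>` form). The
# symbol table below is a minimal stand-in for `printable_ST`.
import pywrapfst as fst

printable_ST = fst.SymbolTable()
printable_ST.add_symbol("<eps>")
printable_ST.add_symbol("</w>")
for ch in "abcdefghijklmnopqrstuvwxyz":
    printable_ST.add_symbol(ch)

compiler = fst.Compiler(isymbols=printable_ST, osymbols=printable_ST,
                        keep_isymbols=True, keep_osymbols=True)
c = 0
for word in ["a", "cab"]:
    for char in word:
        print('{} {} {} {}'.format(c, c + 1, char, char), file=compiler)
        c += 1
    print('{} {} </w> </w>'.format(c, c + 1), file=compiler)
    c += 1
print(str(c), file=compiler)  # marks the final state
fst_sequence = compiler.compile()
print(fst_sequence.num_states())  # 7: one state per character plus </w> marks
# -----------------------------------------------------------------------------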
def intersect(self, other):
    """Constructs an unminimized DFA recognizing the intersection of the
    languages of two given DFAs.

    Args:
        other (DFA): The other DFA that will be used for the intersect
            operation.

    Returns:
        DFA: The resulting DFA.
    """
    self.automaton = fst.intersect(self.automaton, other.automaton)
    return self
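# --- Hedged sketch (not from the original source) ---------------------------
# A toy check of the behavior the wrapper above relies on: intersecting two
# epsilon-free acceptors keeps exactly the strings both accept. Assumes
# pywrapfst; the helper below is invented for illustration.
import pywrapfst as fst

syms = fst.SymbolTable()
syms.add_symbol("<eps>")
for ch in "ab":
    syms.add_symbol(ch)

def acceptor(strings):
    """Unweighted union-of-paths acceptor for a finite set of strings."""
    f = fst.Fst()
    root = f.add_state()
    f.set_start(root)
    for s in strings:
        cur = root
        for ch in s:
            nxt = f.add_state()
            f.add_arc(cur, fst.Arc(syms.find(ch), syms.find(ch),
                                   fst.Weight.one(f.weight_type()), nxt))
            cur = nxt
        f.set_final(cur)
    return f.arcsort(sort_type="olabel")

both = fst.intersect(acceptor(["a", "ab"]), acceptor(["ab", "b"]))
# `both` accepts exactly "ab".
# -----------------------------------------------------------------------------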
def get_prior(self):
    '''
    Sets an array with priors. In the future, priors will be given from the
    RSVP EEG vector.
    OUTPUTS: an array of tuples, each consisting of a character and the
    corresponding probability.
    '''
    sigma_h = self.create_machine_history()
    print(sigma_h)
    # intersect
    sigma_h.arcsort(sort_type="olabel")
    output_dist = fst.intersect(sigma_h, self.lm)
    print(output_dist)
    # process result
    output_dist = output_dist.rmepsilon()
    output_dist = fst.determinize(output_dist)
    output_dist.minimize()
    output_dist = fst.push(output_dist, push_weights=True, to_final=True)
    # Worth converting this history to np.array if vector computations
    # will be involved.
    # Traverses the shortest path until we get to the second-to-last state;
    # the arcs from that state to the final state contain the distribution
    # that we want.
    prev_stateid = curr_stateid = None
    for state in output_dist.states():
        if curr_stateid is not None:
            prev_stateid = curr_stateid
        curr_stateid = state
    priors = []
    for arc in output_dist.arcs(prev_stateid):
        ch = self.lm_syms.find(arc.ilabel)  # ilabel and olabel are the same
        w = float(arc.weight)
        # TODO: for this demo we only need the distribution over the
        # characters from 'a' to 'z'.
        if len(ch) == 1 and ch in self.legit_ch_dict:
            priors.append((ch, w))
    # Assuming the EEG input is an array like [("a", 0.3), ("b", 0.2), ...],
    # sort the array by probability.
    priors = sorted(priors, key=lambda prior: prior[1])
    normalized_dist = self._normalize([prob for _, prob in priors])
    return zip([ch for ch, _ in priors], normalized_dist)
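# --- Hedged sketch (not from the original source) ---------------------------
# What the push step buys us: with push_weights=True and to_final=True,
# path weight is moved toward the final state, so the arcs entering the
# last state carry the (unnormalized) next-symbol distribution that the
# loop above reads off. Toy labels and costs; assumes pywrapfst.
import pywrapfst as fst

f = fst.Fst()
s0, s1, s2 = f.add_state(), f.add_state(), f.add_state()
f.set_start(s0)
f.set_final(s2)
wt = lambda v: fst.Weight(f.weight_type(), v)
f.add_arc(s0, fst.Arc(1, 1, wt(0.7), s1))  # shared history cost
f.add_arc(s1, fst.Arc(2, 2, wt(0.3), s2))  # candidate A
f.add_arc(s1, fst.Arc(3, 3, wt(1.2), s2))  # candidate B

pushed = fst.push(f, push_weights=True, to_final=True)
for state in pushed.states():
    for arc in pushed.arcs(state):
        print(state, arc.ilabel, float(arc.weight))
# -----------------------------------------------------------------------------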
def next_char_dist(self, history, char_lm):
    '''
    Gets the distribution of the next character.
    '''
    history = self.concat_alphabet(history)
    history.arcsort(sort_type="olabel")
    output = fst.intersect(history, char_lm)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()
    # Reads an fst to combine the weights of the next character.
    last_ltr = fst.compose(output, self.ltr_dist)
    last_ltr.project(True)
    last_ltr.push(to_final=True)
    last_ltr.rmepsilon()
    last_ltr = fst.determinize(last_ltr)
    last_ltr.minimize()
    # Extracts priors. Although it's a two-state machine, we keep the
    # generic traversal procedure here just in case.
    prev_stateid = curr_stateid = None
    for state in last_ltr.states():
        if curr_stateid is not None:
            prev_stateid = curr_stateid
        curr_stateid = state
    priors = []
    syms = last_ltr.input_symbols()
    for arc in last_ltr.arcs(prev_stateid):
        ch = syms.find(arc.ilabel)
        w = float(arc.weight)
        if len(ch) == 1:
            priors.append((ch, w))
    # Sorts the priors by probability and normalizes them.
    priors = sorted(priors, key=lambda prior: prior[1])
    priors_vals = [BitWeight(prob) for _, prob in priors]
    total = sum(priors_vals, BitWeight(1e6))
    norm_priors = [(prob / total).loge() for prob in priors_vals]
    return zip([ch for ch, _ in priors], norm_priors)
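# --- Hedged sketch (not from the original source) ---------------------------
# BitWeight above appears to be a project-specific log-domain weight class.
# A plain-Python equivalent of the final normalization, treating the arc
# weights as negative log probabilities, is a standard log-sum-exp:
import math

def normalize_neglog(weights):
    """Maps negative-log weights to probabilities summing to 1."""
    m = min(weights)  # shift stabilizes the exponentials
    exps = [math.exp(m - w) for w in weights]
    total = sum(exps)
    return [e / total for e in exps]

print(normalize_neglog([0.3, 1.2, 2.0]))  # ~[0.629, 0.256, 0.115]
# -----------------------------------------------------------------------------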
import pywrapfst as fst  # assumed: the OpenFst Python extension
import specializer       # assumed: the project's matcher-specialization module

sigfst = fst.Fst.read("sigma.fst")
sigout = fst.Fst.read("sigout.fst")
rhofst = fst.Fst.read("rho.fst")
rhoout = fst.Fst.read("rhoout.fst")
phifst = fst.Fst.read("phi.fst")
phiout = fst.Fst.read("phiout.fst")

sigma_label = 5
rho_label = 6
phi_label = 7
rewrite_mode = "always"
phi_self_loop = True

sigfst = specializer.sigma(sigfst, sigma_label, rewrite_mode).get()
rhofst = specializer.rho(rhofst, rho_label, rewrite_mode).get()
phifst = specializer.phi(phifst, phi_label, rewrite_mode, phi_self_loop).get()

# `one` is the reference FST loaded elsewhere in this script.
print("Original Fst")
print(one)

print("intersection with the sigma machine")
print(fst.intersect(sigfst, one))
print(fst.equal(fst.intersect(sigfst, one), sigout))

print("intersection with the rho machine")
print(fst.intersect(rhofst, one))
print(fst.equal(fst.intersect(rhofst, one), rhoout))

print("intersection with the phi machine")
print(fst.intersect(phifst, one))
print(fst.equal(fst.intersect(phifst, one), phiout))