import glob
import os

import pywrapfst as fst


def processLattices(lats_sets, folders, statePruneTh=10000, pruneTh=10, silence=False):
    '''Applies standard pre-processing operations to SMT lattices
    @lats_sets: lattices to be processed
    @folders: output folders for processed lattices
    @statePruneTh: fsts above this threshold are pruned
    @pruneTh: pruning threshold
    @silence: if True, the function does not print which lattice is being processed'''
    for lats_set, folder in zip(lats_sets, folders):
        print(lats_set)
        print(folder)
        # numericalSort is a numeric filename-sort helper defined elsewhere in this module.
        for f in sorted(glob.glob(lats_set), key=numericalSort):
            lattice = fst.Fst.read(f)
            if lattice.num_states() > statePruneTh:
                # Large lattice: remove epsilons, determinize, minimize,
                # push weights in the log semiring, then prune.
                detminpush = fst.push(fst.arcmap(fst.determinize(lattice.rmepsilon()).minimize(), map_type="to_log"), push_weights=True)
                out = fst.arcmap(fst.push(fst.arcmap(fst.prune(fst.arcmap(detminpush, map_type="to_standard"), weight=pruneTh).minimize(), map_type="to_log"), push_weights=True), map_type="to_standard")
            else:
                # Small lattice: same pipeline without the pruning step.
                # detminpush = fst.push(fst.determinize(fst.arcmap(lattice.rmepsilon(), map_type="to_log")).minimize(), push_weights=True)
                detminpush = fst.push(fst.arcmap(fst.determinize(lattice.rmepsilon()).minimize(), map_type="to_log"), push_weights=True)
                out = fst.arcmap(detminpush, map_type="to_standard")
            out.write(folder + os.path.basename(f))
            if not silence:
                print(os.path.basename(f))
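# Example usage (a minimal sketch; the glob pattern and output folder are
# hypothetical, and the output folder must already exist):
#
#   processLattices(["lats/*.fst"], ["lats.pruned/"],
#                   statePruneTh=10000, pruneTh=10, silence=True)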
def word_sequence_history(self, eeg_saus):
    '''
    Generate a probable word sequence, given the EEG samples,
    by intersecting them with the word language model.
    '''
    word_seq = fst.compose(eeg_saus, self.ltr2wrd)
    # push() is constructive, so the result must be reassigned.
    word_seq = fst.push(word_seq, push_weights=True, to_final=True)
    word_seq.project(project_output=True)
    word_seq.rmepsilon()
    return word_seq
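# Example usage (a sketch; `decoder` stands in for an instance of the class
# this method belongs to, and `eeg_saus` for an EEG sausage/confusion-network
# FST over letters):
#
#   word_lattice = decoder.word_sequence_history(eeg_saus)
#   print(word_lattice)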
def samplePathFromFst(fst_lattice, id2label):
    """Sample a path from a lattice.

    Parameters
    ----------
    fst_lattice : fst.Fst
        Lattice in OpenFst format.
    id2label : mapping
        Maps arc label ids to human-readable labels.

    Returns
    -------
    path : list
        Sequence of (human-readable) labels.
    """
    # Import here only, as some people may not have OpenFst installed.
    import pywrapfst as fst

    # Transform fst_lattice into a stochastic FST.
    stoc_fst_lattice = fst.push(fst_lattice, push_weights=True,
                                remove_total_weight=True)

    # Random walk on the stochastic FST.
    path = []
    __walkFst(stoc_fst_lattice, stoc_fst_lattice.start(), id2label, path)
    return path
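# Example usage (a sketch; "lattice.fst" is a hypothetical file and the toy
# id2label table maps arc label ids to words):
#
#   import pywrapfst as fst
#   lattice = fst.Fst.read("lattice.fst")
#   id2label = {1: "hello", 2: "world"}
#   print(samplePathFromFst(lattice, id2label))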
def topk_choice(self, word_sequence, topk_wds=None):
    '''
    Extract the top-k choices of the LM given a word history (lattice).
    input: a word-sequence lattice and the language model (self.lm)
    output: topk words to complete the lattice
    '''
    # generate sentence fst
    fstout = fst.intersect(word_sequence, self.lm)
    fst_det = fst.determinize(fstout)
    fst_p = fst.push(fst_det, push_weights=True, to_final=True)
    fst_p.rmepsilon()
    fst_rm = fst.determinize(fst_p)
    short = fst.shortestpath(fst_rm, nshortest=10)
    short_det = fst.determinize(short)
    short_det.rmepsilon()
    two_state = fst.compose(short_det, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()
    if topk_wds is not None:  # need to distinguish None and []
        topk_wds.extend(self.get_topk_words(output))
    return output
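# Example usage (a sketch; `lm` stands in for an instance of the class this
# method belongs to, and `word_lattice` for a word-history lattice such as
# the one produced by word_sequence_history above):
#
#   topk = []
#   out_fst = lm.topk_choice(word_lattice, topk_wds=topk)
#   print(topk)  # the top-k completion words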
def fst_finalize(c, last_node, eos_node, path):
    # Close the FST: add the end-of-sentence arc and mark the final state.
    fst_arc(c, last_node, eos_node, args.eos_id)
    c.write("%d\n" % eos_node)
    f = c.compile()
    # Optimize (remove epsilons, determinize, minimize, sort topologically),
    # push weights, and write the result to disk.
    f.rmepsilon()
    f = fst.determinize(f)
    f.minimize()
    f.topsort()
    f = fst.push(f, push_weights=True)
    f.write(path)
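# Example usage (a sketch; assumes fst_arc(c, src, dst, label) writes one
# text-format arc line to the compiler `c`, and that args.eos_id holds the
# end-of-sentence label id -- both are defined elsewhere in this script):
#
#   c = fst.Compiler()
#   fst_arc(c, 0, 1, 42)  # hypothetical arc from state 0 to state 1
#   fst_finalize(c, last_node=1, eos_node=2, path="out.fst")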
def get_prior(self):
    '''
    Set an array with priors.
    In the future, priors are given from the RSVP EEG vector.
    OUTPUTS: an array of tuples, which consists of the character and the
    corresponding probabilities.
    '''
    sigma_h = self.create_machine_history()
    print(sigma_h)
    # intersect
    sigma_h.arcsort(sort_type="olabel")
    output_dist = fst.intersect(sigma_h, self.lm)
    print(output_dist)
    # process result
    output_dist = output_dist.rmepsilon()
    #output_dist = fst.rmepsilon(output_dist)
    output_dist = fst.determinize(output_dist)
    output_dist.minimize()
    output_dist = fst.push(output_dist, push_weights=True, to_final=True)
    # worth converting this history to np.array if vector computations
    # will be involved
    #output_dist.arcsort(sort_type="olabel")

    # Traverse the states until we get to the second-to-last one; the arcs
    # from that state to the final state contain the distribution we want.
    prev_stateid = curr_stateid = None
    for state in output_dist.states():
        if curr_stateid is not None:
            prev_stateid = curr_stateid
        curr_stateid = state

    priors = []
    for arc in output_dist.arcs(prev_stateid):
        ch = self.lm_syms.find(arc.ilabel)  # ilabel and olabel are the same
        w = float(arc.weight)
        # TODO: for this demo we only need the distribution over the
        # characters from 'a' to 'z'
        if len(ch) == 1 and ch in self.legit_ch_dict:
            priors.append((ch, w))
    # assuming the EEG input is an array like [("a", 0.3), ("b", 0.2), ...],
    # sort the array by probability
    priors = sorted(priors, key=lambda prior: prior[1])
    normalized_dist = self._normalize([prob for _, prob in priors])
    return list(zip([ch for ch, _ in priors], normalized_dist))
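# Example usage (a sketch; `speller` stands in for an instance of the class
# this method belongs to):
#
#   for ch, p in speller.get_prior():
#       print(ch, p)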
def printstrings(a, nshortest=1, project_output=False, syms=None, weight=False):
    """
    Return the nshortest unique input strings in the FST a.
    The FST a is projected onto the input or output prior to finding the
    shortest paths. An optional symbol table syms can be provided. Results
    are returned as strings; if the weight flag is set, the path scores
    are included.
    """
    import pywrapfst as fst
    b = a.copy().project(project_output=project_output)
    if nshortest == 1:
        c = fst.shortestpath(b)
    else:
        c = fst.shortestpath(b, nshortest=nshortest, unique=True)
    nba = fst.push(c, push_weights=True).rmepsilon()
    nb = []
    if nba.start() != -1:
        # Each arc out of the start state begins one of the n-best paths.
        for arc1 in nba.arcs(nba.start()):
            w = arc1.weight
            nextstate = arc1.nextstate
            nbi = []
            if syms:
                nbi.append(syms.find(arc1.ilabel))
            else:
                nbi.append(str(arc1.ilabel))
            # Follow the path one arc at a time until a state has no
            # outgoing arcs.
            while True:
                try:
                    nextarc = next(nba.arcs(nextstate))
                except StopIteration:
                    break
                if syms:
                    nbi.append(syms.find(nextarc.ilabel))
                else:
                    nbi.append(str(nextarc.ilabel))
                nextstate = nextarc.nextstate
            if weight:
                nb.append((' '.join(nbi), str(w)))
            else:
                nb.append(' '.join(nbi))
    return nb
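# Example usage (a sketch; builds a tiny two-path acceptor with fst.Compiler
# in AT&T text format, just to exercise printstrings):
#
#   import pywrapfst as fst
#   c = fst.Compiler(acceptor=True)
#   c.write("0 1 1 0.5\n0 1 2 1.5\n1\n")
#   print(printstrings(c.compile(), nshortest=2, weight=True))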