def _readMaps(cls, fsm_fn, encoding='utf-8'):
    imap_fn = os.path.splitext(fsm_fn)[0] + '.isym'
    imap = SymMap.readFromFile(imap_fn, encoding=encoding)
    omap_fn = os.path.splitext(fsm_fn)[0] + '.osym'
    omap = SymMap.readFromFile(omap_fn, encoding=encoding)
    return imap, omap
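# Usage sketch (illustrative only; 'model/hvsparser.fsm' is a hypothetical
# path): the symbol maps are looked up next to the FSM file, differing only
# in extension, so
#
#     imap, omap = cls._readMaps('model/hvsparser.fsm')
#
# reads 'model/hvsparser.isym' and 'model/hvsparser.osym'.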
def genPadder(self):
    """Yield arcs (src, dst, isym, osym) of the padder transducer.

    Symbols of the PTE map and of every dataset map are renumbered into a
    single global range (offset by 1, so 0 stays reserved for epsilon);
    the highest id + 1 is reserved for the _pop_ symbol and stored in
    self._pop_.
    """
    symbols = []
    maps2 = []
    count = 1
    _pop_ = 0
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        pte_map = SymMap()
        pte_map2 = SymMap()
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value + count
            pte_map2[key] = value + count
            _pop_ = max(_pop_, value + count)
        count += len(pte_map)
    else:
        pte_symbols = []
        pte_map = {}
        pte_map2 = {}
    for map in self.symbolMaps:
        if map is None:
            map = {}
        symbols.append(sorted(map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(map.items()):
            new_map[value] = value + count
            new_map2[key] = value + count
            _pop_ = max(_pop_, value + count)
        count += len(new_map)
        maps2.append(new_map2)
    _pop_ += 1

    n_sets = sum(1 for i in maps2 if len(i) != 0)
    p_sets = 0
    end_state = 0
    state = 1
    # Initial epsilon:_pop_ arc, then PTE self-loops, then one transition
    # layer per non-empty dataset, closing back at the end state.
    yield end_state, state, 0, _pop_
    for key, value in sorted(pte_map.items()):
        yield state, state, value, value
    for map in maps2:
        if len(map) == 0:
            continue
        p_sets += 1
        if p_sets == n_sets:
            new_state = end_state
        else:
            new_state = state + 1
        for key, value in sorted(map.items()):
            yield state, new_state, value, value
        state += 1
    self._pop_ = _pop_
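# Illustrative sketch (an assumption, not part of the original module): the
# arcs yielded by genPadder() are 4-tuples (src, dst, isym, osym) and can be
# materialized into AT&T-style FSM text.  The helper below is hypothetical;
# writeFSMPadder() in this codebase plays a similar role.
def _example_write_arcs(arcs, fn, final_state=0):
    fw = file(fn, 'w')
    for src, dst, isym, osym in arcs:
        print >> fw, '%d\t%d\t%d\t%d' % (src, dst, isym, osym)
    # Mark the final state (assumed here to be state 0).
    print >> fw, '%d' % final_state
    fw.close()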
def main(self, concept_map, sym_map, examples, output, threshold):
    self.conceptMap = SymMap.readFromFile(concept_map, format=(int, unicode)).inverse
    self.symMap = SymMap.readFromFile(sym_map, format=(int, unicode)).inverse
    examples = ADict.readFromFile(examples)
    examples = self.mapExamples(examples, threshold)
    # Sort by concept, then by descending count, then by symbol
    # (Python 2 tuple-unpacking lambda).
    key = lambda (k, v): (k[0], -v, k[1])
    examples.writeToFile(output, key=key)
def genRepeater(self):
    """Yield arcs (src, dst, isym, osym) of the repeater transducer."""
    symbols = []
    maps2 = []
    count = 1
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        pte_map = SymMap()
        pte_map2 = SymMap()
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value + count
            pte_map2[key] = value + count
        count += len(pte_map)
    else:
        pte_symbols = []
        pte_map = {}
        pte_map2 = {}
    for map in self.symbolMaps:
        if map is None:
            map = {}
        symbols.append(sorted(map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(map.items()):
            new_map[value] = value + count
            new_map2[key] = value + count
        count += len(new_map)
        maps2.append(new_map2)

    end_state = state = 0
    state_map = SymMap()
    if pte_map2:
        # One branch per PTE symbol, with self-loops over all dataset
        # symbols, closed by an epsilon arc back to the end state.
        for value in pte_map2.values():
            state += 1
            yield end_state, state, value, value
            yield state, state, 0, value
            for map in maps2:
                for sym in map.values():
                    yield state, state, sym, sym
            yield state, end_state, 0, 0
    else:
        for map in maps2:
            for sym in map.values():
                yield state, state, sym, sym
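# The same hypothetical _example_write_arcs() helper sketched after
# genPadder() applies here, since genRepeater() yields the same
# (src, dst, isym, osym) 4-tuples, e.g.
# _example_write_arcs(self.genRepeater(), 'hvsrepeater.txt').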
def _emptyMaps(cls):
    imap = SymMap()
    imap[cls.eps] = 0
    omap = SymMap()
    omap[cls.eps] = 0
    return imap, omap
def main(self, model_dir, encoding=None, batch=False, omit_leaves=False,
         mlf=False, xml_dir=None, ref_mlf=None, skip_empty=False,
         input_chain=None, batch_size=100, no_underscores=True,
         force_pdt=False, pdt_dir=None):
    # Fall back to the terminal encoding, then to a platform default.
    if encoding is None:
        encoding = sys.stdout.encoding
    if encoding is None:
        if os.name == 'nt':
            encoding = 'cp1250'
        else:
            encoding = 'iso-8859-2'

    datasets_fn = pjoin(model_dir, 'datasets')
    datasets_fr = file(datasets_fn, 'r')
    datasets = []
    isymMaps = []
    for i, line in enumerate(datasets_fr):
        line = line.strip()
        datasets.append(line)
        if line != 'off':
            isymMaps.append(SymMap.readFromFile(pjoin(model_dir, 'isym%d.map' % (i + 1,))))
    osymMap = SymMap.readFromFile(pjoin(model_dir, 'osym.map'))

    if 'signed' in datasets:
        da_type = 'signed'
    else:
        da_type = 'normalized'

    if not pdt_dir:
        pdt_dir = '/opt/PDT-2.0/tools/machine-annotation'

    if xml_dir:
        reader = input.MultiReader([xml_dir], input.DXMLReader)
        if force_pdt and ('lemma' in datasets or 'pos' in datasets):
            if os.name == 'nt':
                raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
            reader = input.PDTReader(pdt_dir, reader, online=not batch)
    else:
        reader = input.StdInReader(encoding=encoding, type=da_type)
        if 'lemma' in datasets or 'pos' in datasets:
            if os.name == 'nt':
                raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
            reader = input.PDTReader(pdt_dir, reader, online=not batch)

    if input_chain is not None:
        reader = input.InputChain(input_chain, reader)

    generator = input.InputGenerator(reader, datasets, datasets[0],
                                     noUnderscores=no_underscores)

    hypMLF = MLF()
    refMLF = MLF()
    if not batch:
        # On-line mode: decode one dialogue act at a time.
        for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
            da_empty = not bool(da_semantics.strip())
            if da_empty and skip_empty:
                continue
            refMLF[da_id] = da_semantics + '\n'
            dcd = self.parseLine(model_dir, [da_txts], isymMaps, osymMap,
                                 omitLeaves=omit_leaves)
            if dcd:
                if len(dcd) == 1:
                    hypMLF[da_id] = dcd[0].encode(encoding) + '\n'
                else:
                    hypMLF[da_id] = ';'.join(dcd).encode(encoding) + '\n'
            else:
                # Fallback when decoding produced no output; note that
                # 'line' still holds the last entry of the datasets file
                # read above.
                hypMLF[da_id] = line + '\n'
            if not mlf:
                print hypMLF[da_id],
    else:
        # Batch mode: decode batch_size inputs at a time.
        all_processed = False
        inputs = generator.readInputs()
        while not all_processed:
            da_count = 0
            lines = []
            ids = []
            for da_fn, da_id, da_semantics, da_txts in inputs:
                da_empty = not bool(da_semantics.strip())
                if da_empty and skip_empty:
                    continue
                refMLF[da_id] = da_semantics + '\n'
                lines.append(da_txts)
                ids.append(da_id)
                da_count += 1
                if da_count >= batch_size:
                    break
            else:
                all_processed = True
            dcd = self.parseLine(model_dir, lines, isymMaps, osymMap,
                                 omitLeaves=omit_leaves)
            for da_id, ol in zip(ids, dcd):
                hypMLF[da_id] = ol.encode(encoding) + '\n'
                if not mlf:
                    print hypMLF[da_id],

    if mlf:
        s = ''.join(hypMLF.toLines())
        print s
    if ref_mlf:
        refMLF.writeToFile(ref_mlf)
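# Minimal sketch of the batching pattern used above, factored out for
# clarity (illustrative; the helper name is an assumption, not part of the
# original code).  Any iterator can play the role of generator.readInputs().
def _example_batches(inputs, batch_size):
    inputs = iter(inputs)
    while True:
        batch = []
        for item in inputs:
            batch.append(item)
            if len(batch) >= batch_size:
                break
        if not batch:
            break
        yield batch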
def fsmconvert(self, pteMapFn=None):
    sys.path.append('src')
    import fsm
    from svc.ui import gmtk

    max_states = int(self.settings['FSM_STATES'])
    cutoff_sym = float(self.settings['FSM_CUTOFF_SYM'])
    cutoff_trans = float(self.settings['FSM_CUTOFF_TRANS'])

    self.setCommonParams()
    FSM_DIR = self.settings['FSM_DIR']
    mkdirp(FSM_DIR)

    conceptMapFn = self.settings['CONCEPT_MAP']
    self.logger.debug("Reading concept map: %s", conceptMapFn)
    conceptMap = SymMap.readFromFile(conceptMapFn, format=(int, unicode)).inverse
    del conceptMap['_SINK_']
    #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING DEPARTURE'.split())
    #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING'.split())

    dataset_fn = os.path.join(FSM_DIR, 'datasets')
    dataset_fw = file(dataset_fn, 'w')
    sMaps = []
    for ds in [1, 2, 3]:
        ds_value = self.settings['S%d_DATASET' % ds]
        if ds_value != 'off':
            mapFn = self.settings['S%d_MAP' % ds]
            self.logger.debug("Reading s%d map: %s", ds, mapFn)
            map = SymMap.readFromFile(mapFn, format=(int, unicode)).inverse
            #map = SymMap((k, v) for (k, v) in map.iteritems() if k in u'dobrý den kdy jede _empty_ _unseen_'.split())
            sMaps.append(map)
        else:
            self.logger.debug("Dataset s%d is turned off", ds)
            sMaps.append(None)
        dataset_fw.write(ds_value + '\n')
    dataset_fw.close()

    if pteMapFn is not None:
        self.logger.debug("Reading pte map: %s", pteMapFn)
        pteMap = SymMap.readFromFile(pteMapFn, format=(unicode, int))
    else:
        pteMap = {}
    pteSymbols = pteMap.keys()

    mstr = os.path.join(self.settings['MSTR_DCD_DIR'], 'in.mstr')
    cppOptions = self.settings['CPP_OPTIONS'].split()
    workspace = gmtk.Workspace(cppOptions=cppOptions, readDTS=False)
    self.logger.info('Reading master file: %s', mstr)
    workspace.readMasterFile(mstr)

    self.logger.info('Creating FSM from arcs')
    self.logger.info('Total number of concepts: %d', len(conceptMap))
    #self.logger.info('Total number of symbols: %d', len(s1Map))

    stateGenerator = fsm.FSMGenerator(workspace, conceptMap, sMaps,
                                      cutoff_sym, cutoff_trans, max_states,
                                      pteSymbols=pteSymbols, logger=self.logger)
    stateGenerator.writeFSMRepeater(os.path.join(FSM_DIR, 'hvsrepeater.txt'))
    stateGenerator.writeFSMPadder(os.path.join(FSM_DIR, 'hvspadder.txt'))
    stateGenerator.writeFSM(os.path.join(FSM_DIR, 'hvsparser_pad.txt'))
    stateGenerator.stateMap.writeToFile(os.path.join(FSM_DIR, 'state.map'))
    stateGenerator.osymMap.writeToFile(os.path.join(FSM_DIR, 'osym.map'))
    for i, map in enumerate(stateGenerator.isymMaps):
        map.writeToFile(os.path.join(FSM_DIR, 'isym%d.map' % (i + 1,)))
    stateGenerator.ipteMap.writeToFile(os.path.join(FSM_DIR, 'pte.map'))

    self.fsmcompile()
def createPTESymbolMap(self, pteSymbols):
    'pteSymbols - Pass-through-empty symbols'
    ret = SymMap()
    for i, sym in enumerate(pteSymbols):
        ret[sym] = i
    return ret
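# Example (illustrative; the symbol names are hypothetical):
# createPTESymbolMap(['_silence_', '_noise_']) returns a SymMap mapping
# '_silence_' -> 0 and '_noise_' -> 1.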
def setupMaps(self, conceptMap, symMap):
    self.conceptMap = SymMap.readFromFile(conceptMap, format=(int, unicode)).inverse
    self.symMap = SymMap.readFromFile(symMap, format=(int, unicode)).inverse
def makeFMStxt(self, separ, dataLm):
    isym_map = None
    isym_fn = os.path.join(dataLm, 'dacoder.fsm.isym')
    fsm = os.path.join(dataLm, 'dacoder.fsm.txt')
    fsm_fw = file(fsm, 'w')
    add = 1
    da_map = SymMap.readFromFile(os.path.join(dataLm, 'dialogue_act.fsm.isym'))
    for da in separ:
        fn = self.mapTXT(dataLm, da)
        fn_lm = self.mapLM(dataLm, da)
        fn_fsm = self.mapFSM(dataLm, da)
        da_num_op = da_map['operator_' + da]
        da_num_us = da_map['user_' + da]
        if isym_map is None:
            self.convertLMtoFSM(fn_lm, isym_fn)
            isym_map = SymMap.readFromFile(isym_fn)
            _empty_ = isym_map.add('_empty_')
            _operator_ = isym_map.add('_operator_')
            _user_ = isym_map.add('_user_')
            for i in separ:
                isym_map.add('user_%s' % i)
                isym_map.add('operator_%s' % i)
            isym_map.writeToFile(isym_fn)
        s0 = None
        states = set()
        for line in self.convertLMtoFSM(fn_lm):
            # GAWK hack: normalize decimal commas to dots.
            line = line.replace(',', '.')
            splitted = line.split()
            if s0 is None:
                s0 = int(splitted[0]) + add
                print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _operator_, da_num_op)
                print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _user_, da_num_us)
            if len(splitted) in (1, 2):
                # Final-state line, optionally with a weight.
                state_no = int(splitted[0])
                if len(splitted) == 2:
                    weight = float(splitted[1])
                else:
                    weight = 0.
                print >> fsm_fw, '%d\t0\t%d\t0\t%e' % (state_no + add, _empty_, weight)
                states.add(state_no)
            elif len(splitted) in (3, 4):
                # Arc line: src dst isym [weight].
                state_no_1 = int(splitted[0])
                state_no_2 = int(splitted[1])
                isym = int(splitted[2])
                if len(splitted) == 4:
                    weight = float(splitted[3])
                else:
                    weight = 0.
                print >> fsm_fw, '%d\t%d\t%d\t0\t%e' % (state_no_1 + add, state_no_2 + add, isym, weight)
                states.add(state_no_1)
                states.add(state_no_2)
            else:
                raise ValueError("Unknown FSM line: %r" % line)
        # Shift the next LM's state numbers past the ones used so far.
        add += max(states) + 1
    for i in separ:
        for j in ['user', 'operator']:
            da = '%s_%s' % (j, i)
            isym = isym_map[da]
            osym = da_map[da]
            print >> fsm_fw, '0\t0\t%d\t%d\t0' % (isym, osym)
    print >> fsm_fw, '0'
    fsm_fw.close()
    da_map.writeToFile(os.path.join(dataLm, 'dacoder.fsm.osym'))
    FSMCompile('-t', fsm, '-F', os.path.join(dataLm, 'dacoder.fsm'))
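# The dacoder.fsm.txt lines written above follow the AT&T text format
# consumed by FSMCompile: arcs are 'src<TAB>dst<TAB>isym<TAB>osym<TAB>weight'
# and a final state is a line with the state number alone, e.g.
# (hypothetical numbers):
#
#     0   42  17  3   0
#     13  0   5   0   1.250000e+00
#     0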
def convertStateMap(self, map):
    ret = SymMap()
    for k, v in map.iteritems():
        ret[self.strState(k)] = v
    return ret
def genStates(self):
    """Yield weighted arcs (src, dst, isym, osym, prob) of the parser FSM.

    States correspond to concept stacks (tuples of DEPTH concepts); the
    state space is expanded approximately best-first from the empty stack,
    applying the transition and symbol cut-offs, until max_states states
    have been processed.
    """
    processed = set()
    backoff_stat = ADict(default=set)
    osym_map = SymMap()
    osym_map['epsilon'] = 0

    # Model parameters (GMTK conditional probability tables).
    pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
    push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

    c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
    c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
    c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
    c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']
    c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

    s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
    s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
    s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
    s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
    s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
    s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

    s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
    s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
    s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
    s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
    s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

    s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
    s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
    s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
    s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
    s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

    conceptMap = self.conceptMap
    _EMPTY_ = conceptMap[EMPTY_CONCEPT]
    _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
    allConcepts = sorted(conceptMap.values())

    # Renumber PTE symbols and dataset symbols into one global range
    # (offset by 1, so 0 stays reserved for epsilon).
    symbols = []
    maps = []
    maps2 = []
    count = 1
    pte_map = SymMap()
    pte_map2 = SymMap()
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value + count
            pte_map2[key] = value + count
        count += len(pte_map)
    else:
        pte_symbols = []
    for map in self.symbolMaps:
        if map is None:
            map = {}
        symbols.append(sorted(map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(map.items()):
            new_map[value] = value + count
            new_map2[key] = value + count
        count += len(new_map)
        maps.append(new_map)
        maps2.append(new_map2)

    s0 = (_EMPTY_,) * 4
    s0_expanded = False

    cutoff_sym = self.cutoff_sym
    cutoff_trans = self.cutoff_trans
    max_states = self.max_states
    logger = self.logger

    # Expansion stack ordered by (backoff level, distance from s0).
    stack = [(0, 0, s0)]
    stack_set = set([s0])
    state_map = SymMap()
    state_map[s0] = 0
    _pop_ = self._pop_
    interim_counter = 0
    n_arcs = 0

    while stack:
        if max_states is None:
            total_states = len(state_map) - interim_counter
        else:
            total_states = max_states
        if logger is not None:
            logger.debug(' #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d',
                         100. * len(processed) / total_states,
                         total_states - len(processed), total_states, n_arcs)

        c_t_backoff, c_t_dist, c_t = stack.pop(0)
        backoff_stat[c_t_backoff].add(c_t)
        if logger is not None:
            logger.debug(' %.2f: %s, backoff=%d',
                         c_t_dist, self.strState(c_t), c_t_backoff)
        state_c_t = state_map[c_t]
        processed.add(c_t)
        stack_set.remove(c_t)

        ret = []
        pop_pmf = list(pop_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])
        push_pmf = list(push_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])

        for pop in range(0, MAX_POP + 1):
            prob_pop = pop_pmf[pop]
            if prob_pop <= cutoff_trans:
                continue
            interim_counter += 1
            c_inter = c_t[pop:] + (_EMPTY_,) * pop
            osym = ')' * pop
            if not osym:
                osym = 'epsilon'
            ret.append((prob_pop, c_t, (c_t, c_inter), _pop_, osym))

            for push in range(0, MAX_PUSH + 1):
                prob_push = push_pmf[push]
                if push == 0:
                    to_push_all = [()]
                else:
                    to_push_all = cartezian(*[allConcepts] * push)
                for to_push in to_push_all:
                    c_new = (to_push + c_inter)[:DEPTH]
                    if (c_t == c_new) and not (push == pop == 0):
                        continue
                    if _DUMMY_ in c_new[1:]:
                        continue

                    # Output symbol
                    osym = ''
                    for push_concept in reversed(to_push):
                        osym += conceptMap.inverse[push_concept] + '('
                    if not osym:
                        osym = 'epsilon'

                    # Smoothing
                    backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        c1_pmf = c1_Given_C234[:c_new[1], c_new[2], c_new[3]]
                    elif backoff == 1:
                        c1_pmf = c1_Given_C23[:c_new[1], c_new[2]]
                    else:
                        c1_pmf = c1_Given_C2[:c_new[1]]
                    c2_pmf = c2_Given_C[:c_new[2], c_new[3]]

                    if push == 0:
                        prob_new_c = 1.0
                    elif push == 1:
                        prob_new_c = c1_pmf[to_push[0]]
                    elif push == 2:
                        prob_new_c = c1_pmf[to_push[0]] * c2_pmf[to_push[1]]
                    prob_trans = prob_push * prob_new_c

                    # Do cut-off
                    if prob_trans <= cutoff_trans:
                        continue

                    # Smoothing
                    backoff = s1_backoff[c_new[0], c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        s_pmf = [list(s1_Given_C1234[:c_new[0], c_new[1], c_new[2], c_new[3]]),
                                 list(s2_Given_C1234[:c_new[0], c_new[1], c_new[2], c_new[3]]),
                                 list(s3_Given_C1234[:c_new[0], c_new[1], c_new[2], c_new[3]])]
                    elif backoff == 1:
                        s_pmf = [list(s1_Given_C123[:c_new[0], c_new[1], c_new[2]]),
                                 list(s2_Given_C123[:c_new[0], c_new[1], c_new[2]]),
                                 list(s3_Given_C123[:c_new[0], c_new[1], c_new[2]])]
                    elif backoff == 2:
                        s_pmf = [list(s1_Given_C12[:c_new[0], c_new[1]]),
                                 list(s2_Given_C12[:c_new[0], c_new[1]]),
                                 list(s3_Given_C12[:c_new[0], c_new[1]])]
                    elif backoff == 3:
                        s_pmf = [list(s1_Given_C1[:c_new[0]]),
                                 list(s2_Given_C1[:c_new[0]]),
                                 list(s3_Given_C1[:c_new[0]])]
                    else:
                        s_pmf = [list(s1_Unigram), list(s2_Unigram), list(s3_Unigram)]

                    if c_new not in processed and c_new not in stack_set:
                        stack_set.add(c_new)
                        c_new_dist = c_t_dist - log(prob_trans)
                        insort(stack, (backoff, c_new_dist, c_new))

                    c_next = (c_t, c_inter)
                    if pte_symbols and c_inter == (_EMPTY_,) * 4 and push != 0:
                        for pte_sym in pte_symbols:
                            prob_ptesym = 1.0
                            pte_sym = pte_map[pte_sym]
                            pte_osym = pte_map2.inverse[pte_sym]
                            ret.append((prob_trans * prob_ptesym, c_next, c_new, pte_sym, pte_osym))
                        prob_trans = 1.0
                        c_next = c_new

                    for sym, map, pmf in zip(symbols, maps, s_pmf):
                        if map is None:
                            continue
                        for isym in sym:
                            prob_isym = pmf[isym]
                            # Do cut-off
                            if prob_isym <= cutoff_sym:
                                continue
                            else:
                                isym = map[isym]
                                ret.append((prob_trans * prob_isym, c_next, c_new, isym, osym))
                                # For symbols other than the first
                                prob_trans = 1.0
                                c_next = c_new
                                osym = 'epsilon'

        for prob, c_t, c_new, isym, osym in ret:
            state_c_new = state_map.add(c_new)
            state_c_t = state_map.add(c_t)
            osym = osym_map.add(osym)
            n_arcs += 1
            yield state_c_t, state_c_new, isym, osym, prob

        if max_states is not None and len(processed) >= max_states:
            break

    self.stateMap = self.convertStateMap(state_map)
    self.osymMap = osym_map
    self.isymMaps = maps2
    self.ipteMap = pte_map2

    backoff_stat = ADict((k, len(v)) for (k, v) in backoff_stat.iteritems())
    if logger is not None:
        logger.debug('Backoff statistics:')
        logger.debug('===================')
        total = backoff_stat.sum()
        for key, value in sorted(backoff_stat.items()):
            logger.debug(' backoff=%d: #%d (%.2f%%)', key, value, 100. * value / total)
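# Hedged sketch (an assumption, not part of the original module): genStates()
# yields weighted arcs (src, dst, isym, osym, prob); FSM toolkits typically
# expect costs, so one plausible writer converts the probabilities to -log
# weights.  The helper name and the lack of final-state handling are
# hypothetical; writeFSM() in this codebase plays the real role.
def _example_write_weighted(arcs, fn):
    from math import log
    fw = file(fn, 'w')
    for src, dst, isym, osym, prob in arcs:
        print >> fw, '%d\t%d\t%d\t%d\t%e' % (src, dst, isym, osym, -log(prob))
    fw.close()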
def _readMaps(cls, fsm_fn, encoding='utf-8'):
    map_fn = os.path.splitext(fsm_fn)[0] + '.isym'
    map = SymMap.readFromFile(map_fn, encoding=encoding)
    return map
def _emptyMaps(cls):
    map = SymMap()
    map[cls.eps] = 0
    return map
def loadMaps(self, maps):
    ret = []
    for m in maps:
        fn = pjoin(self.fsm_dir, m)
        ret.append(SymMap.readFromFile(fn))
    return ret
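# Usage sketch (illustrative; the file names are assumptions based on the
# files written by fsmconvert() above):
#
#     isym1, isym2, isym3 = self.loadMaps(['isym1.map', 'isym2.map', 'isym3.map'])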