def makeZeroOrder(self, allCounts):
    # Build the zero-order (unigram) model: discount the observed counts,
    # then spread the freed probability mass uniformly over the full
    # predicted vocabulary so that unseen words receive a non-zero count.
    minCount, discount = self.parametrizeOrder(0)
    counts = sumLotsOfCounts(map(lambda item: item[1], allCounts))
    effectiveCounts, total = self.effectiveCounts(counts, minCount, discount)
    effectiveTotal = effectiveCounts.sum()
    seenWords = set(w for w, n in effectiveCounts)
    assert self.sentenceStart not in seenWords
    unseenWords = set(self.predictedWords) - seenWords
    assert self.sentenceStart not in unseenWords
    self.log('number of unseen words', len(unseenWords))
    pZero = 1 / len(self.predictedWords)
    backOffMass = total - effectiveTotal
    nZero = backOffMass * pZero
    interpolatedCounts = []
    for predicted, effectiveCount in effectiveCounts:
        interpolatedCounts.append((predicted, effectiveCount + nZero))
    for predicted in unseenWords:
        interpolatedCounts.append((predicted, nZero))
    interpolatedCounts = Counts(interpolatedCounts)
    self.log('%d predicted events' % interpolatedCounts.size)
    return [(MGram(()), (interpolatedCounts, total))]
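# Illustration (a minimal sketch, not part of the original module): the
# zero-order step above discounts every observed count and redistributes the
# freed mass uniformly. Toy counts and an assumed discount of 0.5; Counts,
# MGram and the surrounding class are omitted.
def _demoZeroOrderRedistribution():
    vocabulary = ['a', 'b', 'c', 'd']                   # predicted words
    counts = {'a': 3, 'b': 1}                           # observed unigrams
    discount = 0.5                                      # assumed value
    total = sum(counts.values())                        # 4
    effective = {w: n - discount for w, n in counts.items()}
    backOffMass = total - sum(effective.values())       # 1.0 freed mass
    nZero = backOffMass / len(vocabulary)               # uniform share, 0.25
    interpolated = {w: effective.get(w, 0.0) + nZero for w in vocabulary}
    assert abs(sum(interpolated.values()) - total) < 1e-9   # mass conserved
    return interpolated  # {'a': 2.75, 'b': 0.75, 'c': 0.25, 'd': 0.25}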
def loadP2PSample(compfname):
    # A phoneme-to-phoneme sample is given as two lexicon files joined by a
    # colon; pair up the pronunciations of words present in both lexica.
    fnames = compfname.split(':')
    assert len(fnames) == 2
    left = dict(loadG2PSample(fnames[0]))
    right = dict(loadG2PSample(fnames[1]))
    sample = []
    for w in set(left.keys()) & set(right.keys()):
        sample.append((left[w], right[w]))
    return sample
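# Hypothetical usage of loadP2PSample (the file names are made up): both
# lexica are passed as one colon-joined argument, and the result pairs the
# two pronunciations of every shared headword.
#
#     sample = loadP2PSample('american.lex:british.lex')
#     for sourcePhon, targetPhon in sample:
#         ...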
def rampUp(self):
    # Raise the model order: for every (history, predicted) event, create
    # the extended history and register it with a neutral back-off score.
    data = self.get()
    histories = set(history for history, predicted, score in data)
    newHistories = set()
    for history, predicted, score in data:
        if predicted is None:
            continue
        newHistory = history + (predicted,)
        if newHistory not in histories:
            newHistories.add(newHistory)
    for newHistory in newHistories:
        data.append((newHistory, None, 0.0))
    self.set(data)
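# Illustration with toy data, assuming the (history, predicted, score)
# triple layout used by get()/set() above: ramp-up adds every one-longer
# history reachable from an existing event.
def _demoRampUp():
    data = [((), 'a', 1.2), ((), 'b', 2.3), (('a',), 'b', 0.7)]
    histories = set(h for h, p, s in data)                    # {(), ('a',)}
    new = set(h + (p,) for h, p, s in data if p is not None) - histories
    return sorted(new)  # [('a', 'b'), ('b',)]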
def addSupervised(self, lexicon=None):
    """
    Caveat: supervised splitting might come up with graphones that are
    NOT present in the model g2p, because they were trimmed!
    Therefore this function may modify the sequitur inventory.
    """
    segmenter = Segmenter(self.model)
    fragments = set()
    for orth, phon in lexicon:
        logLik, joint = segmenter.firstBestJoint(orth, phon)
        for fragment in joint:
            fragments.add(fragment)
        joint = [lmToken(gra, pho) for gra, pho in joint]
        if orth not in self.memory:
            self.memory[orth] = []
        self.memory[orth].append(joint)
    oldSize, newSize = self.model.strip()
    print('stripped number of multigrams from %d to %d' % (oldSize, newSize))
    sequitur = self.model.sequitur
    for gra, pho in fragments:
        fragment = (sequitur.leftInventory.parse(gra),
                    sequitur.rightInventory.parse(pho))
        sequitur.inventory.index(fragment)
    self.translator.setModel(self.model)
class EventGenerator:
    specialEvents = set(['<s>', '</s>'])

    def __init__(self, knownWords, fragmentizer, order):
        self.knownWords = set(knownWords)
        self.fragmentizer = fragmentizer
        self.order = order
        self.rotor = RotatingDict()

    def fragmentize(self, word):
        # Memoize fragmentizations; RotatingDict bounds the cache size.
        if word not in self.rotor:
            self.rotor[word] = tuple(self.fragmentizer(word))
        return self.rotor[word]

    def frobnicate(self, rawWords):
        raise NotImplementedError

    def __call__(self, source):
        for line in source:
            words = line.split()
            if words[0] != '<s>':
                # Wrap unmarked sentences in boundary tokens.
                assert words[-1] != '</s>'
                words = ['<s>'] + words + ['</s>']
            for event in self.frobnicate(words):
                yield event, 1
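# frobnicate() is deliberately abstract; the event generators used in main()
# below (HybridEventGenerator, OovEventGenerator, PhonemeEventGenerator)
# presumably supply it. A minimal hypothetical subclass, not from the
# original code base, that emits plain m-gram events of the configured
# order could look like this:
class _DemoWordEventGenerator(EventGenerator):
    def frobnicate(self, rawWords):
        # Yield (history, predicted) pairs, truncating each history to at
        # most self.order preceding words.
        for i, predicted in enumerate(rawWords):
            history = tuple(rawWords[max(0, i - self.order):i])
            yield history, predicted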
def setVocabulary(self, vocabulary):
    self.vocabulary = vocabulary
    self.sentenceStart = vocabulary.index('<s>')
    # Every word except the sentence-start and the none token can be
    # predicted.
    predictedWords = set(self.vocabulary.indices())
    predictedWords.remove(self.sentenceStart)
    predictedWords.remove(self.vocabulary.noneIndex)
    self.predictedWords = sorted(predictedWords)
def masterSequenceModel(self, model):
    # Collect all histories of the given model into a fresh sequence model
    # with neutral scores.
    allHistories = set()
    for history, predicted, score in model.sequenceModel.get():
        allHistories.add(history)
    result = SequenceModel.SequenceModel()
    result.setInitAndTerm(self.sequitur.term, self.sequitur.term)
    result.set([(history, None, 0.0) for history in allHistories])
    return result
def wipeOut(self, vocabularySize):
    # Reset all scores: the empty history gets the score of a uniform
    # distribution over the vocabulary, all other histories get zero.
    histories = set()
    for history, predicted, score in self.get():
        histories.add(history)
    histories.remove(())
    data = [((), None, math.log(vocabularySize))]
    for history in histories:
        data.append((history, None, 0.0))
    self.set(data)
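# Consistency check, assuming scores are negative natural-log probabilities
# (an assumption consistent with the neutral 0.0 scores used elsewhere):
# the score stored for the empty history then corresponds to a uniform
# distribution over the vocabulary.
def _checkWipeOutScore(vocabularySize=5):
    import math
    score = math.log(vocabularySize)
    p = math.exp(-score)                        # invert the -log convention
    assert abs(p - 1.0 / vocabularySize) < 1e-12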
def rampUp(self):
    newHistories = set()
    for (history, predicted), probability in self.prob.items():
        if predicted is None:
            continue
        newHistory = history + (predicted,)
        if (newHistory, None) not in self.prob:
            newHistories.add(newHistory)
    for newHistory in newHistories:
        self.prob[(newHistory, None)] = 1.0
    # Invalidate any compiled representation of the model.
    self.compiled = None
def mainTest(translator, testSample, options):
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)
    if options.testResult:
        resultFile = gOpenOut(options.testResult, defaultEncoding)
    else:
        resultFile = None
    from Evaluation import Evaluator
    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = stdout
    if options.test_segmental:
        # Ignore supra-segmental symbols (stress, syllable boundaries) when
        # comparing pronunciations.
        supraSegmental = set(['.', "'", '"'])
        def removeSupraSegmental(phon):
            return [p for p in phon if p not in supraSegmental]
        evaluator.compareFilter = removeSupraSegmental
    result = evaluator.evaluate(translator)
    print(result, file=stdout)
class OovFragmentGenerator:
    specialEvents = set(["<s>", "</s>"])

    def __init__(self, knownWords, fragmentizer):
        self.knownWords = set(knownWords)
        self.fragmentizer = fragmentizer
        self.rotor = RotatingDict()
        self.fragmentDict = {}

    def fragmentize(self, word):
        if word not in self.rotor:
            self.rotor[word] = tuple(self.fragmentizer(word))
        return self.rotor[word]

    def __call__(self, source):
        for line in source:
            words = line.split()
            self.frobnicate(words)
        return self.fragmentDict

    def frobnicate(self, rawWords):
        # Record a fragmentization for every out-of-vocabulary word.
        for w in rawWords:
            if w in self.knownWords:
                continue
            if w in self.specialEvents:
                continue
            if w in self.fragmentDict:
                continue
            fragments = self.fragmentize(w)
            self.fragmentDict[w] = fragments

    def modifyLmText(self, rawWords):
        # Replace each OOV word by its space-joined fragments.
        modWords = []
        for w in rawWords:
            if w in self.knownWords:
                modWords.append(w)
            elif w in self.specialEvents:
                modWords.append(w)
            else:
                fragments = self.fragmentize(w)
                modWords.append(" ".join(fragments))
        return modWords
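# Hypothetical usage sketch: any callable mapping a word to a fragment
# sequence will do as fragmentizer (in the real pipeline it is a g2p-backed
# Fragmentizer); the corpus line and the word list are made up.
def _demoOovFragmentGenerator():
    toyFragmentizer = lambda word: [word[:2], word[2:]]
    gen = OovFragmentGenerator(['the', 'cat'], toyFragmentizer)
    fragments = gen(['the zylophone'])   # one corpus line
    # fragments == {'zylophone': ('zy', 'lophone')}
    return gen.modifyLmText('the zylophone'.split())  # ['the', 'zy lophone']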
def sizeTemplates(self):
    # Enumerate the distinct (letter count, phoneme count) combinations
    # occurring in the multigram inventory.
    result = set()
    for i in range(1, self.size() + 1):
        left, right = self.symbol(i)
        result.add((len(left), len(right)))
    return sorted(result)
def main(options, args):
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set(orth for orth, phon in lexicon)

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        # pickled models must be opened in binary mode
        subliminalG2p = pickle.load(open(options.subliminal_g2p, 'rb'))
    else:
        subliminalG2p = None
    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel, 'rb'))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))
        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments:
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
import sys
import codecs
import pickle
from elementtree.ElementTree import ElementTree, Element, Comment, SubElement
from itertools import starmap

import mGramCounts
from sequitur import Segmenter, Translator
from g2p import loadBlissLexicon
from misc import gOpenIn, gOpenOut, set, reversed

# ===========================================================================
nonLmTokens = set("""
"QUOTE "UNQUOTE "BEGIN-QUOTE "END-QUOTE %PERCENT .POINT /SLASH
""".split())

def isLmToken(word):
    return word not in nonLmTokens

# ===========================================================================
def lmToken(letters, phonemes):
    return '*' + ''.join(letters) + ':' + '_'.join(phonemes) + '*'

def addGraphonesToLexicon(xml, graphones):
    lexicon = xml.getroot()
    for letters, phonemes in graphones:
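# Examples of the token format produced by lmToken above (the letter and
# phoneme symbols are illustrative):
#
#     lmToken(('s', 'h'), ('S',))   ->  '*sh:S*'
#     lmToken('ing', ('I', 'N'))    ->  '*ing:I_N*'  (a string iterates as letters)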