def readApply(fname):
    for line in gOpenIn(fname, defaultEncoding):
        word = line.strip()
        left = tuple(word)
        yield word, left
def iter(self, sorted=True, consolidated=True):  # NOTE: 'sorted' shadows the builtin; kept for API compatibility
    for line in gOpenIn(self.fname):
        fields = line.split()
        # list(...) is needed in Python 3, where map() returns an iterator
        # without a reverse() method
        mGram = list(map(self.inputConversion, fields[:-1]))
        mGram.reverse()
        item = (tuple(mGram[1:]), mGram[0])
        value = self.value(fields[-1])
        yield item, value
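# Sketch of the count-file line format consumed by iter() above, inferred
# from the parsing rather than confirmed by the source: each line holds an
# m-gram followed by its count, "w1 ... wn count" (tokens may be transformed
# by self.inputConversion).  For example:
#
#   line = 'a b c 5'
#   # fields[:-1] -> ['a', 'b', 'c']; reversed -> ['c', 'b', 'a']
#   # yielded item:  (('b', 'a'), 'c')   -- (reversed history, predicted word)
#   # yielded value: self.value('5')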
def loadPlainSample(fname, encoding=None):
    sample = []
    for line in gOpenIn(fname, encoding or defaultEncoding):
        fields = line.split()
        if not fields:
            continue
        left = tuple(fields[0])
        right = tuple(fields[1:])
        sample.append((left, right))
    return sample
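# Usage sketch for loadPlainSample (the file name is hypothetical).  Each
# non-empty line is "word phoneme phoneme ...": the word is exploded into a
# tuple of characters, the pronunciation kept as a tuple of tokens.
#
#   sample = loadPlainSample('train.lex')
#   # a line "hello h e l l o" yields
#   # (('h', 'e', 'l', 'l', 'o'), ('h', 'e', 'l', 'l', 'o'))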
def loadBlissLexicon(fname):
    # the original standalone 'elementtree' package is long obsolete;
    # the stdlib module provides the same ElementTree(file=...) constructor
    from xml.etree.ElementTree import ElementTree
    xml = ElementTree(file=gOpenIn(fname))
    pronunciations = pronunciationsFromXmlLexicon(xml)
    result = [(orth, phon)
              for orth in pronunciations
              if not (orth.startswith('[') and orth.endswith(']'))
              for phon in pronunciations[orth]]
    result.sort()
    return result
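# Return sketch for loadBlissLexicon: a sorted list of (orthography,
# pronunciation) pairs, one pair per pronunciation variant, with bracketed
# special entries (e.g. a hypothetical '[SILENCE]') filtered out by the
# startswith('[')/endswith(']') test above.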
def templateTestRawCounts(self, StorageClass):
    text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
    sentences = map(str.split, text)
    grams = mGramsChainCount(sentences, self.order)
    counts = StorageClass()
    counts.addIter(grams)
    f = EqualFile('tests/nab-mini-corpus.raw-counts.gz')
    TextStorage.write(f, counts)
    self.assertTrue(f)
def loadG2PSample(fname):
    if fname == '-':
        sample = loadPlainSample(fname)
    else:
        firstLine = gOpenIn(fname, defaultEncoding).readline()
        if firstLine.startswith('<?xml'):
            sample = [(tuple(orth), tuple(phon))
                      for orth, phon in loadBlissLexicon(fname)]
        else:
            sample = loadPlainSample(fname)
    return sample
def templateTestMappedCounts(self, StorageClass):
    vocabulary = loadVocabulary('tests/nab-5k-vocabulary.txt.gz')
    text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
    sentences = map(str.split, text)
    sentences = map(lambda s: list(map(vocabulary.map, s)), sentences)
    grams = mGramsChainCount(sentences, self.order)
    counts = StorageClass()
    counts.addIter(grams)
    f = EqualFile('tests/nab-mini-corpus.mapped-counts.gz')
    TextStorage.write(f, counts)
    self.assertTrue(f)
def readApplyP2G(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) == 1:
            word = fields[0]
            left = tuple(fields[0].split())
        elif len(fields) == 2:
            word = fields[0]
            left = tuple(fields[1].split())
        else:
            print('unknown format in file: %s' % line, file=stderr)
            # skip malformed lines; the original fell through and re-yielded
            # the previous iteration's values
            continue
        yield word, left
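# Input sketch for readApplyP2G, inferred from the tab-splitting above:
# either a single column holding a space-separated phoneme sequence,
#
#   "h e l l o"          -> word = 'h e l l o', left = ('h', 'e', 'l', 'l', 'o')
#
# or two tab-separated columns, a word plus its phoneme sequence:
#
#   "hello\th e l l o"   -> word = 'hello',     left = ('h', 'e', 'l', 'l', 'o')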
def main(options, args):
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()
    if options.text:
        text = misc.gOpenIn(options.text)
        # itertools.imap no longer exists in Python 3; map is already lazy
        sentences = map(str.split, text)
        sentences = map(lambda s: list(map(vocabulary.map, s)), sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(consolidate(mergeSort(
                [TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print('no counts', file=sys.stderr)
        return
    if options.map_oov:
        if not options.vocabulary:
            print('you need to specify a vocabulary', file=sys.stderr)
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts
    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)
    if options.counts_of_counts:
        coc = [countsOfCounts(mGramReduceToOrder(counts, order))
               for order in range(options.order)]
        import pprint
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def main(options, args):
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)
    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)
    builder.setHighestOrder(options.order - 1)
    if options.count_cutoffs:
        cutoffs = list(map(int, options.count_cutoffs.split()))
        builder.setCountCutoffs(cutoffs)
    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)
    if options.counts_of_counts:
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        coc = [mGramCounts.countsOfCounts(
                   mGramCounts.mGramReduceToOrder(counts, order))
               for order in range(options.order)]
    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print('warning: no counts for orders above %d' % (maximumOrder + 1))
        builder.setHighestOrder(maximumOrder)
    builder.estimateDiscounts(coc)
    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()
    builder.build(counts, lm)
    if __debug__ and False:  ### TESTING
        print('verifying normalization ...', file=sys.stdout)
        lm2 = Lm(lm)
        lm2.checkNormalisation()
def readApplyP2P(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        fields = line.split()
        word = fields[0]
        left = tuple(fields[1:])
        yield word, left
def readApply(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        word = line.strip()
        left = tuple(word)
        yield word, left
def loadVocabulary(fname):
    vocabulary = ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    vocabulary.add([line.strip() for line in gOpenIn(fname)], soft=True)
    vocabulary.sort()
    return vocabulary
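# Usage sketch for loadVocabulary: the file holds one token per line, and
# the sentence-boundary symbols '<s>' and '</s>' are always added ahead of
# the file content.  The test-data path below appears elsewhere in this
# code; vocabulary.map presumably sends out-of-vocabulary tokens to the
# unknown symbol (cf. vocabulary.unknownSymbol above).
#
#   vocabulary = loadVocabulary('tests/nab-5k-vocabulary.txt.gz')
#   sentence = [vocabulary.map(w) for w in 'hello world'.split()]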
def readApplyP2P(fname):
    for line in gOpenIn(fname, defaultEncoding):
        fields = line.split()
        word = fields[0]
        left = tuple(fields[1:])
        yield word, left
def main(options, args):
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([orth for orth, phon in lexicon])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
        subliminalG2p = None
    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))
        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # was 'assert ValueError(...)', which always passes because
                # the exception object is truthy
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(' '.join(modWords), file=f)