def main(options, args):
    """Build a translator as requested by ``options`` and run test/apply passes.

    A translation memory is used when ``options.fakeTranslator`` is set;
    otherwise a model is obtained from ``SequiturTool.procureModel``.

    Returns 1 when no model could be procured, otherwise ``None``.
    ``args`` is accepted but not used here.
    """
    sample_loader = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, sample_loader, log=stdout)
        if not model:
            return 1
        # A Translator is only built when one of the passes below needs it.
        if options.testSample or options.applySample:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model
    else:
        translator = MemoryTranslator(sample_loader(options.fakeTranslator))

    if options.testSample:
        test_sample = sample_loader(options.testSample)
        mainTest(translator, test_sample, options)
        translator.reportStats(sys.stdout)

    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
def g2pMain(options, args):
    """Procure a G2P translator and, if requested, apply it to a single word.

    Standard output and standard error are wrapped in codec writers for the
    locale's preferred encoding, with unencodable characters written as
    backslash escapes.

    Returns 1 when no model could be procured, the result of
    ``g2pApplyWord`` when ``options.applyWord`` is set, and ``None``
    otherwise.  ``args`` is accepted but not used here.
    """
    import locale

    loadSample = loadG2PSample
    enc = locale.getpreferredencoding()

    def _encoded(stream):
        # Prefer the underlying binary buffer when the stream exposes one.
        sink = stream.buffer if hasattr(stream, 'buffer') else stream
        return codecs.getwriter(enc)(sink, errors='backslashreplace')

    log_stdout = _encoded(sys.stdout)
    log_stderr = _encoded(sys.stderr)

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        needs_translator = (options.testSample or options.applySample
                            or options.applyWord)
        if needs_translator:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    if options.applyWord:
        return g2pApplyWord(translator, options, log_stdout)
def __init__(self, dict_path=__dict_path__, model_path=__model_path__):
    """Load the g2p model and the pronunciation dictionary.

    Both paths go through ``os.path.expanduser`` before use.  The model is
    procured via ``SequiturTool.procureModel`` and wrapped in a
    ``Translator``.  If no model is returned, an error is logged and
    initialisation stops early: the dictionary is then NOT loaded and
    ``self.__model__`` keeps the falsy value.

    Each dictionary line is split on single spaces; the first field is the
    key and the remaining fields are stored as its list value.
    """
    self._dict_ = dict()
    dict_path = os.path.expanduser(dict_path)
    model_path = os.path.expanduser(model_path)
    self.__dict_path__ = dict_path
    self.__model_path__ = model_path

    sequitur_options = Values()
    sequitur_options.resume_from_checkpoint = False
    sequitur_options.modelFile = model_path
    sequitur_options.shouldRampUp = False
    sequitur_options.trainSample = False
    sequitur_options.shouldTranspose = False
    sequitur_options.newModelFile = False
    sequitur_options.shouldSelfTest = False

    self.__model__ = SequiturTool.procureModel(sequitur_options, None)
    if not self.__model__:
        logger.error('Can\'t load g2p model.')
        return None
    self.__model__ = Translator(self.__model__)

    # The context manager guarantees the dictionary file is closed, which
    # the previous ``open(...).readlines()`` left to the garbage collector.
    with open(dict_path) as dict_file:
        for line in dict_file:
            fields = line.strip('\n').split(' ')
            self._dict_[fields[0]] = fields[1:]
def transliterate(model, word):
    """Transliterate ``word`` with the bundled model named by ``model``.

    ``model`` must be ``'pythainlp_lexicon'`` or ``'wiktionary_phonemic'``;
    any other value raises ``KeyError``.  The word is passed to the
    translator as a tuple of its characters and the resulting symbols are
    joined with the model's connector string.

    Returns the joined string, or the integer 1 when the model could not
    be procured.
    """
    class Struct:
        def __init__(self, **entries):
            self.__dict__.update(entries)

    # model name -> (model file, string used to join the output symbols)
    known_models = {
        'pythainlp_lexicon': ('./lib/model-7', ''),
        'wiktionary_phonemic': ('./lib/tha-pt-b-7', '-'),
    }
    model_file, connector = known_models[model]

    # Every option starts out as None; the few real values are set below.
    settings = dict.fromkeys((
        'profile', 'resource_usage', 'psyco', 'tempdir', 'trainSample',
        'develSample', 'testSample', 'checkpoint', 'resume_from_checkpoint',
        'shouldTranspose', 'newModelFile', 'shouldTestContinuously',
        'shouldSelfTest', 'lengthConstraints', 'shouldSuppressNewMultigrams',
        'viterbi', 'shouldRampUp', 'shouldWipeModel',
        'shouldInitializeWithCounts', 'eager_discount_adjustment',
        'fixed_discount', 'phoneme_to_phoneme', 'test_segmental',
        'testResult', 'applySample', 'variants_mass', 'variants_number',
        'fakeTranslator', 'stack_limit',
    ))
    settings.update(
        modelFile=model_file,
        minIterations=20,
        maxIterations=100,
        encoding='UTF-8',
        applyWord=word,
    )
    options = Struct(**settings)

    loaded = SequiturTool.procureModel(options, g2p.loadG2PSample)
    if not loaded:
        return 1
    translator = g2p.Translator(loaded)
    del loaded

    symbols = translator(tuple(word))
    return connector.join(symbols)
def main(options, args):
    """Build a translator as requested by ``options`` and run the chosen passes.

    Log output goes through codec writers for the locale's preferred
    encoding, with unencodable characters written as backslash escapes.
    Up to three passes run, each guarded by its option: test
    (``testSample``), apply to a file (``applySample``) and apply to a
    single word (``applyWord``).

    Returns 1 when no model could be procured, otherwise ``None``.
    ``args`` is accepted but not used here.
    """
    import locale

    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    # The console streams follow the locale; options.encoding relates to
    # the lexicon, not to standard IO.
    enc = locale.getpreferredencoding()

    def _encoded(stream):
        # Prefer the underlying binary buffer when the stream exposes one.
        sink = stream.buffer if hasattr(stream, 'buffer') else stream
        return codecs.getwriter(enc)(sink, errors='backslashreplace')

    log_stdout = _encoded(sys.stdout)
    log_stderr = _encoded(sys.stderr)

    if not options.fakeTranslator:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        needs_translator = (options.testSample or options.applySample
                            or options.applyWord)
        if needs_translator:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model
    else:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))

    if options.testSample:
        test_sample = loadSample(options.testSample)
        mainTest(translator, test_sample, options, log_stdout)
        translator.reportStats(log_stdout)

    if options.applySample:
        # Results go to '-' in the lexicon encoding, statistics to stderr.
        result_out = gOpenOut('-', options.encoding or defaultEncoding)
        mainApply(translator, options, result_out)
        translator.reportStats(log_stderr)

    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
def load_g2p(model_path):
    """Procure the g2p model stored at ``model_path``.

    Builds an options object with ``modelFile`` set to ``model_path`` and
    all other listed switches set to ``False``, then hands it to
    ``SequiturTool.procureModel``.

    Returns the model.  If the result is falsy, prints a message and exits
    the process with status 1.
    """
    sequitur_options = Values()
    sequitur_options.modelFile = model_path
    disabled = (
        'resume_from_checkpoint',
        'shouldRampUp',
        'trainSample',
        'shouldTranspose',
        'shouldSelfTest',
        'newModelFile',
    )
    for switch in disabled:
        setattr(sequitur_options, switch, False)

    model = SequiturTool.procureModel(sequitur_options, None)
    if model:
        return model

    print('Can\'t load g2p model.')
    sys.exit(1)
def __init__(self, modelfn=SEQUITUR_MODEL):
    """Procure the model in ``modelfn`` and wrap it in a ``Translator``.

    The procured model is kept as ``self.model`` and the translator as
    ``self.translator``.  ``procureModel`` logs to ``sys.stdout``.
    """
    options = SeqOptionsObject()
    options.modelFile = modelfn
    for switch in ('resume_from_checkpoint', 'shouldRampUp',
                   'shouldTranspose', 'shouldSelfTest'):
        setattr(options, switch, False)
    for unset in ('trainSample', 'newModelFile'):
        setattr(options, unset, None)

    self.model = SequiturTool.procureModel(
        options,
        loadG2PSample,
        log=sys.stdout,
    )
    self.translator = Translator(self.model)
def main(options, args):
    """Procure a model and translate the sentences in ``options.applySample``.

    Every whitespace-separated word in the file is first passed to
    ``addUnknowns`` together with the model.  Each line is then translated
    as a tuple of its words and printed space-joined; a line whose
    translation fails prints ``<translation-failed/>`` instead.

    Nothing beyond procuring the model happens when ``options.applySample``
    is not set.  ``args`` is accepted but not used here.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if not options.applySample:
        return

    lines = gopen(options.applySample).readlines()
    vocabulary = Set([token for sentence in lines for token in sentence.split()])
    addUnknowns(model, vocabulary)

    translator = Translator(model)
    for sentence in lines:
        source = tuple(sentence.split())
        try:
            output = " ".join(translator(source))
        except translator.TranslationFailure:
            output = "<translation-failed/>"
        print(output)
def main(options, args):
    """Procure a model and translate the sentences in ``options.applySample``.

    Every whitespace-separated word in the file is first passed to
    ``addUnknowns`` together with the model.  Each line is then translated
    as a tuple of its words and printed space-joined; a line whose
    translation fails prints ``<translation-failed/>`` instead.

    Nothing beyond procuring the model happens when ``options.applySample``
    is not set.  ``args`` is accepted but not used here.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if not options.applySample:
        return

    lines = gopen(options.applySample).readlines()
    vocabulary = Set([token for sentence in lines for token in sentence.split()])
    addUnknowns(model, vocabulary)

    translator = Translator(model)
    for sentence in lines:
        source = tuple(sentence.split())
        try:
            output = ' '.join(translator(source))
        except translator.TranslationFailure:
            output = '<translation-failed/>'
        # Single-argument form prints the same text under the Python 2
        # print statement this snippet was written for.
        print(output)
def __init__(self, model_path):
    """Procure the model at ``model_path`` and keep a ``Translator`` for it.

    A full option set is assembled by hand.  As a side effect the
    module-level names ``defaultEncoding``, ``stdout`` and ``stderr`` are
    rebound: the first to the option encoding, the other two to stream
    writers for that encoding wrapped around ``sys.stdout`` and
    ``sys.stderr``.
    """
    global defaultEncoding
    global stdout, stderr
    import codecs

    class options(object):
        pass

    options = options()

    # Options that stay unset.
    for name in (
            'testSample', 'trainSample', 'shouldInitializeWithCounts',
            'psyco', 'stack_limit', 'shouldTranspose', 'shouldRampUp',
            'resume_from_checkpoint', 'lengthConstraints', 'checkpoint',
            'eager_discount_adjustment', 'fakeTranslator', 'tempdir',
            'profile', 'variants_number', 'testResult', 'variants_mass',
            'shouldSuppressNewMultigrams', 'develSample', 'shouldWipeModel',
            'resource_usage', 'test_segmental', 'fixed_discount',
            'newModelFile', 'shouldSelfTest', 'viterbi',
            'shouldTestContinuously', 'phoneme_to_phoneme'):
        setattr(options, name, None)

    # Options that carry a real value.
    options.modelFile = model_path
    options.encoding = 'ISO-8859-15'
    options.applySample = 'args.txt'
    options.minIterations = 20
    options.maxIterations = 100

    defaultEncoding = options.encoding

    # Item 3 of the codec lookup result is the stream-writer factory.
    make_writer = codecs.lookup(options.encoding)[3]
    stdout = make_writer(sys.stdout)
    stderr = make_writer(sys.stderr)

    model = SequiturTool.procureModel(options, loadG2PSample, log=stdout)
    self.translator = Translator(model)
def mainTest(translator, testSample, options):
    """Evaluate ``translator`` on ``testSample`` and print the result.

    The sample is transposed first when ``options.shouldTranspose`` is set.
    When ``options.testResult`` names a file, it is opened with the default
    encoding and given to the evaluator as its result file.  With
    ``options.test_segmental`` the evaluator compares sequences after the
    symbols ``.``, ``'`` and ``"`` have been filtered out.

    The evaluation result is written to the module-level ``stdout``.
    """
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)

    resultFile = (gOpenOut(options.testResult, defaultEncoding)
                  if options.testResult else None)

    from Evaluation import Evaluator

    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = stdout

    if options.test_segmental:
        supraSegmental = set(('.', "'", '"'))

        def isSegmental(symbol):
            return symbol not in supraSegmental

        def removeSupraSegmental(phon):
            # filter() is kept deliberately: its return type depends on
            # the interpreter version and on the type of ``phon``.
            return filter(isSegmental, phon)

        evaluator.compareFilter = removeSupraSegmental

    result = evaluator.evaluate(translator)
    print >> stdout, result
# ===========================================================================
def main(options, args):
    """Procure a model and translate the sentences in ``options.applySample``.

    Every whitespace-separated word in the file is first passed to
    ``addUnknowns`` together with the model.  Each line is then translated
    as a tuple of its words and printed space-joined; a line whose
    translation fails prints ``<translation-failed/>`` instead.

    Nothing beyond procuring the model happens when ``options.applySample``
    is not set.  ``args`` is accepted but not used here.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if not options.applySample:
        return

    lines = gopen(options.applySample).readlines()
    vocabulary = Set([token for sentence in lines for token in sentence.split()])
    addUnknowns(model, vocabulary)

    translator = Translator(model)
    for sentence in lines:
        source = tuple(sentence.split())
        try:
            output = ' '.join(translator(source))
        except translator.TranslationFailure:
            output = '<translation-failed/>'
        # Single-argument form prints the same text under the Python 2
        # print statement this snippet was written for.
        print(output)


# ===========================================================================
if __name__ == '__main__':
    # Command-line entry point: collect the shared option groups, add the
    # apply option, then hand control to tool.run.
    import optparse
    import tool

    usage_text = '%prog [OPTION]... FILE...\n' + __doc__
    version_text = '%prog ' + __version__
    optparser = optparse.OptionParser(usage=usage_text, version=version_text)
    SequiturTool.addOptions(optparser)
    tool.addTrainOptions(optparser)
    optparser.add_option(
        '-a', '--apply',
        dest='applySample',
        metavar='FILE',
        help='apply translation to sentences read from FILE')
    options, args = optparser.parse_args()

    tool.run(main, options, args)
mainTest(translator, loadSample(options.testSample), options) translator.reportStats(sys.stdout) if options.applySample: mainApply(translator, options) translator.reportStats(sys.stderr) # =========================================================================== if __name__ == '__main__': import optparse, tool optparser = optparse.OptionParser(usage='%prog [OPTION]... FILE...\n' + str(__doc__), version='%prog ' + __version__) tool.addOptions(optparser) SequiturTool.addTrainOptions(optparser) optparser.add_option('-e', '--encoding', default='ISO-8859-15', help='use character set encoding ENC', metavar='ENC') optparser.add_option('-P', '--phoneme-to-phoneme', action='store_true', help='train/apply a phoneme-to-phoneme converter') optparser.add_option( '--test-segmental', action='store_true', help= 'evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks' )
if options.testSample: mainTest(translator, loadSample(options.testSample), options) translator.reportStats(sys.stdout) if options.applySample: mainApply(translator, options) translator.reportStats(sys.stderr) # =========================================================================== if __name__ == '__main__': import optparse, tool optparser = optparse.OptionParser( usage = '%prog [OPTION]... FILE...\n' + __doc__, version = '%prog ' + __version__) tool.addOptions(optparser) SequiturTool.addTrainOptions(optparser) optparser.add_option( '-e', '--encoding', default='ISO-8859-15', help='use character set encoding ENC', metavar='ENC') optparser.add_option( '-P', '--phoneme-to-phoneme', action='store_true', help='train/apply a phoneme-to-phoneme converter') optparser.add_option( '--test-segmental', action='store_true', help='evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks') optparser.add_option( '-B', '--result', dest='testResult', help='store test result in table FILE (for use with bootlog or R)', metavar='FILE') optparser.add_option( '-a', '--apply', dest='applySample', help='apply grapheme-to-phoneme conversion to words read from FILE', metavar='FILE')
translator = Translator(model) for line in lines: left = tuple(line.split()) try: result = translator(left) print(" ".join(result)) except translator.TranslationFailure: print("<translation-failed/>") # =========================================================================== if __name__ == "__main__": import optparse import tool optparser = optparse.OptionParser(usage="%prog [OPTION]... FILE...\n" + __doc__, version="%prog " + __version__) SequiturTool.addOptions(optparser) tool.addTrainOptions(optparser) optparser.add_option( "-a", "--apply", dest="applySample", help="apply translation to sentences read from FILE", metavar="FILE", ) options, args = optparser.parse_args() tool.run(main, options, args)
def getOptParser():
    """Build and return the command-line option parser.

    The parser starts with the option groups registered by
    ``tool.addOptions`` and ``SequiturTool.addTrainOptions`` and then adds
    the options defined below (encoding, phoneme-to-phoneme mode, test and
    apply settings, variant generation, translation memory, stack limit).

    Returns the configured ``optparse.OptionParser``; nothing is parsed
    here.
    """
    import optparse
    import tool

    optparser = optparse.OptionParser(
        usage='%prog [OPTION]... FILE...\n' + str(__doc__),
        version='%prog ' + __version__)
    tool.addOptions(optparser)
    SequiturTool.addTrainOptions(optparser)

    optparser.add_option(
        '-e', '--encoding', default='ISO-8859-15',
        help='use character set encoding ENC', metavar='ENC')
    optparser.add_option(
        '-P', '--phoneme-to-phoneme', action='store_true',
        help='train/apply a phoneme-to-phoneme converter')
    optparser.add_option(
        '--test-segmental', action='store_true',
        help='evaluate only at segmental level, i.e. do not count syllable boundaries and stress marks')
    optparser.add_option(
        '-B', '--result', dest='testResult',
        help='store test result in table FILE (for use with bootlog or R)',
        metavar='FILE')
    optparser.add_option(
        '-a', '--apply', dest='applySample',
        help='apply grapheme-to-phoneme conversion to words read from FILE',
        metavar='FILE')
    optparser.add_option(
        '-w', '--word', dest='applyWord',
        help='apply grapheme-to-phoneme conversion to word',
        metavar='string')
    # Raw string: the help text contains a literal backslash ("\sum"),
    # which is an invalid escape sequence in a normal string literal.
    optparser.add_option(
        '-V', '--variants-mass', type='float',
        help=r'generate pronunciation variants until \sum_i p(var_i) >= Q (only effective with --apply)',
        metavar='Q')
    optparser.add_option(
        '--variants-number', type='int',
        help='generate up to N pronunciation variants (only effective with --apply)',
        metavar='N')
    optparser.add_option(
        '-f', '--fake', dest='fakeTranslator',
        help='use a translation memory (read from sample FILE) instead of a genuine model (use in combination with -x to evaluate two files against each other)',
        metavar='FILE')
    optparser.add_option(
        '--stack-limit', type='int',
        help='limit size of search stack to N elements', metavar='N')
    return optparser