def main(options, args):
    """Drive testing and/or application of a translator from parsed options.

    A translator is taken either from a fake in-memory sample
    (``options.fakeTranslator``) or from a model obtained through
    ``SequiturTool.procureModel``.  It is then run on ``options.testSample``
    and/or ``options.applySample`` when those are set.

    Returns 1 when no model could be procured; otherwise returns None.
    ``args`` is accepted but not used here.
    """
    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=stdout)
        if not model:
            return 1
        # A translator is only built when one of the modes below will use it.
        wants_translator = options.testSample or options.applySample
        if wants_translator:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # Drop this function's reference to the model.
        del model

    if options.testSample:
        test_sample = loadSample(options.testSample)
        mainTest(translator, test_sample, options)
        translator.reportStats(sys.stdout)

    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
def __init__(self, dict_path=__dict_path__, model_path=__model_path__):
    """Load the g2p model and the lookup dictionary.

    Args:
        dict_path: Path of the dictionary file (``~`` is expanded).  Each
            line is split on single spaces; the first field becomes the key
            and the remaining fields the value list in ``self._dict_``.
        model_path: Path of the model file (``~`` is expanded) handed to
            ``SequiturTool.procureModel`` as ``modelFile``.

    If the model cannot be procured, an error is logged and construction
    stops early: ``self._dict_`` stays empty and ``self.__model__`` keeps the
    falsy value returned by ``procureModel``.
    """
    self._dict_ = dict()
    dict_path = os.path.expanduser(dict_path)
    model_path = os.path.expanduser(model_path)
    self.__dict_path__ = dict_path
    self.__model_path__ = model_path

    sequitur_options = Values()
    sequitur_options.resume_from_checkpoint = False
    sequitur_options.modelFile = model_path
    sequitur_options.shouldRampUp = False
    sequitur_options.trainSample = False
    sequitur_options.shouldTranspose = False
    sequitur_options.newModelFile = False
    sequitur_options.shouldSelfTest = False

    self.__model__ = SequiturTool.procureModel(sequitur_options, None)
    if not self.__model__:
        logger.error('Can\'t load g2p model.')
        return
    self.__model__ = Translator(self.__model__)

    # Read the dictionary inside a context manager so the file handle is
    # closed deterministically (the bare open().readlines() leaked it).
    with open(dict_path) as dict_file:
        for line in dict_file:
            fields = line.strip('\n').split(' ')
            self._dict_[fields[0]] = fields[1:]
def g2pMain(options, args):
    """Obtain a G2P translator and apply it to ``options.applyWord``.

    A translator is taken either from a fake in-memory sample
    (``options.fakeTranslator``) or from a model obtained through
    ``SequiturTool.procureModel``.  Log output goes to a writer over standard
    output that uses the locale's preferred encoding and replaces
    unencodable characters with backslash escapes.

    Returns 1 when no model could be procured, the result of
    ``g2pApplyWord`` when ``options.applyWord`` is set, and None otherwise.
    ``args`` is accepted but not used here.
    """
    import locale

    loadSample = loadG2PSample
    enc = locale.getpreferredencoding()

    # Wrap the binary layer when stdout exposes one, else the stream itself.
    # (The unused stderr writer the function used to build has been removed.)
    raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    log_stdout = codecs.getwriter(enc)(raw_stdout, errors='backslashreplace')

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # Drop this function's reference to the model.
        del model

    if options.applyWord:
        return g2pApplyWord(translator, options, log_stdout)
def transliterate(model, word):
    """Translate ``word`` with the named model and join the output symbols.

    Args:
        model: Either ``'pythainlp_lexicon'`` or ``'wiktionary_phonemic'``;
            any other key raises ``KeyError``.
        word: The word to translate; it is passed to the translator as a
            tuple of its characters.

    Returns:
        The translator's output joined with the model's connector (``''`` or
        ``'-'``), or the integer 1 if the model could not be procured.
    """

    class Struct:
        def __init__(self, **entries):
            self.__dict__.update(entries)

    # Model file on disk, keyed by model name.
    model_files = {
        'pythainlp_lexicon': './lib/model-7',
        'wiktionary_phonemic': './lib/tha-pt-b-7',
    }
    # String placed between output symbols, keyed by model name.
    connectors = {
        'pythainlp_lexicon': '',
        'wiktionary_phonemic': '-',
    }
    modelFile = model_files[model]
    connector = connectors[model]

    # Option attributes that are set to None.
    unset_names = (
        'profile', 'resource_usage', 'psyco', 'tempdir', 'trainSample',
        'develSample', 'testSample', 'checkpoint', 'resume_from_checkpoint',
        'shouldTranspose', 'newModelFile', 'shouldTestContinuously',
        'shouldSelfTest', 'lengthConstraints', 'shouldSuppressNewMultigrams',
        'viterbi', 'shouldRampUp', 'shouldWipeModel',
        'shouldInitializeWithCounts', 'eager_discount_adjustment',
        'fixed_discount', 'phoneme_to_phoneme', 'test_segmental',
        'testResult', 'applySample', 'variants_mass', 'variants_number',
        'fakeTranslator', 'stack_limit',
    )
    settings = dict.fromkeys(unset_names)
    settings['modelFile'] = modelFile
    settings['minIterations'] = 20
    settings['maxIterations'] = 100
    settings['encoding'] = 'UTF-8'
    settings['applyWord'] = word
    options = Struct(**settings)

    sample_loader = g2p.loadG2PSample
    loaded = SequiturTool.procureModel(options, sample_loader)
    if not loaded:
        return 1

    translator = g2p.Translator(loaded)
    # Drop this function's reference to the model.
    del loaded

    symbols = translator(tuple(word))
    return connector.join(symbols)
def main(options, args):
    """Translate every line of ``options.applySample`` and print the results.

    The model is procured first.  When ``options.applySample`` is set, all
    whitespace-separated tokens of the sample are registered with the model
    through ``addUnknowns``, and each line is then translated as a tuple of
    its tokens.  A successful translation is printed space-joined; a
    ``TranslationFailure`` prints ``<translation-failed/>`` instead.

    ``args`` is accepted but not used here.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if not options.applySample:
        return

    lines = gopen(options.applySample).readlines()
    tokenized = [tuple(line.split()) for line in lines]

    words = Set([token for tokens in tokenized for token in tokens])
    addUnknowns(model, words)

    translator = Translator(model)
    for left in tokenized:
        try:
            output = " ".join(translator(left))
        except translator.TranslationFailure:
            output = "<translation-failed/>"
        print(output)
def main(options, args):
    """Translate every line of ``options.applySample`` and print the results.

    The model is procured first.  When ``options.applySample`` is set, all
    whitespace-separated tokens of the sample are registered with the model
    through ``addUnknowns``, and each line is then translated as a tuple of
    its tokens.  A successful translation is printed space-joined; a
    ``TranslationFailure`` prints ``<translation-failed/>`` instead.

    ``args`` is accepted but not used here.
    """
    model = SequiturTool.procureModel(options, loadSample)
    if options.applySample:
        lines = gopen(options.applySample).readlines()
        words = Set([word for line in lines for word in line.split()])
        addUnknowns(model, words)
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                # Parenthesised single-argument print: same output as the
                # Python 2 print statement, and valid syntax on Python 3.
                print(' '.join(result))
            except translator.TranslationFailure:
                print('<translation-failed/>')
def main(options, args):
    """Drive testing and/or application of a translator from parsed options.

    A translator is taken either from a fake in-memory sample
    (``options.fakeTranslator``) or from a model obtained through
    ``SequiturTool.procureModel``.  It is then run on ``options.testSample``,
    ``options.applySample`` and/or ``options.applyWord`` when those are set.

    Returns 1 when no model could be procured; otherwise returns None.
    ``args`` is accepted but not used here.
    """
    import locale

    def console_writer(stream, encoding):
        # Wrap the binary layer when the stream exposes one, else the stream
        # itself; unencodable characters become backslash escapes.
        target = stream.buffer if hasattr(stream, 'buffer') else stream
        return codecs.getwriter(encoding)(target, errors='backslashreplace')

    loadSample = loadP2PSample if options.phoneme_to_phoneme else loadG2PSample

    # Console writers use the locale's preferred encoding: options.encoding
    # relates to the lexicon, not the standard IO.
    enc = locale.getpreferredencoding()
    log_stdout = console_writer(sys.stdout, enc)
    log_stderr = console_writer(sys.stderr, enc)

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        # A translator is only built when one of the modes below will use it.
        wants_translator = (
            options.testSample or options.applySample or options.applyWord
        )
        if wants_translator:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # Drop this function's reference to the model.
        del model

    if options.testSample:
        test_sample = loadSample(options.testSample)
        mainTest(translator, test_sample, options, log_stdout)
        translator.reportStats(log_stdout)

    if options.applySample:
        # Results go to '-' opened with the lexicon encoding (or the default).
        result_out = gOpenOut('-', options.encoding or defaultEncoding)
        mainApply(translator, options, result_out)
        translator.reportStats(log_stderr)

    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
def load_g2p(model_path):
    """Procure the g2p model stored at ``model_path``.

    Builds an options object with ``modelFile`` set to ``model_path`` and all
    other options used here set to False, then asks
    ``SequiturTool.procureModel`` for the model.

    Returns the model.  If none is returned, prints an error message and
    exits the process with status 1.
    """
    sequitur_options = Values()
    sequitur_options.modelFile = model_path
    disabled = (
        'resume_from_checkpoint',
        'shouldRampUp',
        'trainSample',
        'shouldTranspose',
        'shouldSelfTest',
        'newModelFile',
    )
    for option_name in disabled:
        setattr(sequitur_options, option_name, False)

    model = SequiturTool.procureModel(sequitur_options, None)
    if model:
        return model

    print("Can't load g2p model.")
    sys.exit(1)
def __init__(self, modelfn=SEQUITUR_MODEL):
    """Procure the model in ``modelfn`` and build a translator for it.

    The model is stored on ``self.model`` and the translator built from it
    on ``self.translator``.  ``procureModel`` is given ``sys.stdout`` as its
    log.
    """
    # (option name, value) pairs, applied in this order.
    settings = (
        ('resume_from_checkpoint', False),
        ('modelFile', modelfn),
        ('shouldRampUp', False),
        ('trainSample', None),
        ('shouldTranspose', False),
        ('newModelFile', None),
        ('shouldSelfTest', False),
    )
    options = SeqOptionsObject()
    for option_name, value in settings:
        setattr(options, option_name, value)

    self.model = SequiturTool.procureModel(
        options, loadG2PSample, log=sys.stdout
    )
    self.translator = Translator(self.model)
def __init__(self, model_path):
    """Procure the model at ``model_path`` and build a translator for it.

    Side effects: rebinds the module-level names ``defaultEncoding`` (to
    ``'ISO-8859-15'``), ``stdout`` and ``stderr`` (to writers in that
    encoding over the process's standard streams).  The translator is stored
    on ``self.translator``.
    """
    global defaultEncoding, stdout, stderr
    import codecs

    class _Options(object):
        pass

    options = _Options()

    # Option attributes that are set to None.
    unset_names = (
        'testSample', 'trainSample', 'shouldInitializeWithCounts', 'psyco',
        'stack_limit', 'shouldTranspose', 'shouldRampUp',
        'resume_from_checkpoint', 'lengthConstraints', 'checkpoint',
        'eager_discount_adjustment', 'fakeTranslator', 'tempdir', 'profile',
        'variants_number', 'testResult', 'variants_mass',
        'shouldSuppressNewMultigrams', 'develSample', 'shouldWipeModel',
        'resource_usage', 'test_segmental', 'fixed_discount', 'newModelFile',
        'shouldSelfTest', 'viterbi', 'shouldTestContinuously',
        'phoneme_to_phoneme',
    )
    for option_name in unset_names:
        setattr(options, option_name, None)
    options.modelFile = model_path
    options.encoding = 'ISO-8859-15'
    options.applySample = 'args.txt'
    options.maxIterations = 100
    options.minIterations = 20

    defaultEncoding = options.encoding

    # codecs.getwriter returns the codec's StreamWriter class directly, so
    # the unused encoder/decoder/reader need not be unpacked.
    stream_writer = codecs.getwriter(options.encoding)
    # A codecs StreamWriter emits encoded bytes.  Streams that expose a
    # binary ``buffer`` (Python 3 text streams) need the writer on that
    # layer; streams without one are wrapped directly, as before.
    stdout = stream_writer(getattr(sys.stdout, 'buffer', sys.stdout))
    stderr = stream_writer(getattr(sys.stderr, 'buffer', sys.stderr))

    loadSample = loadG2PSample
    model = SequiturTool.procureModel(options, loadSample, log=stdout)
    self.translator = Translator(model)