Example #1
class Fragmentizer:
    def __init__(self, model):
        self.model = model
        self.translator = Translator(self.model)
        self.memory = dict()

    def addSupervised(self, lexicon=None):
        """
        Caveat: supervised splitting might come up with graphones that
        are NOT present in the model g2p, because they were trimmed!
        Therefore this function may modify the sequitur inventory.
        """
        segmenter = Segmenter(self.model)
        fragments = set()
        for orth, phon in lexicon:
            logLik, joint = segmenter.firstBestJoint(orth, phon)
            for fragment in joint:
                fragments.add(fragment)
            joint = [lmToken(gra, pho) for gra, pho in joint]
            if orth not in self.memory:
                self.memory[orth] = []
            self.memory[orth].append(joint)

        oldSize, newSize = self.model.strip()
        print("stripped number of multigrams from %d to %d" % (oldSize, newSize))

        sequitur = self.model.sequitur
        for gra, pho in fragments:
            fragment = (
                sequitur.leftInventory.parse(gra),
                sequitur.rightInventory.parse(pho),
            )
            sequitur.inventory.index(fragment)
        self.translator.setModel(self.model)

    def __call__(self, word):
        translations = []
        if word in self.memory:
            translations = self.memory[word]
        else:
            try:
                logLik, joint = self.translator.firstBestJoint(word)
                joint = [lmToken(gra, pho) for gra, pho in joint]
                translations.append(joint)
            except Translator.TranslationFailure:
                print('failed to represent "%s" using graphones' % word)
                translations.append([word + "[UNKNOWN]"])
        return translations
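
A minimal usage sketch for the class above, assuming a trained Sequitur model pickle and that Segmenter, Translator and lmToken come from the same module as Fragmentizer; the model path and the lexicon entry are hypothetical, and the phonemes must already exist in the model's inventory:

import pickle

with open("g2p.model.pickle", "rb") as f:      # hypothetical model path
    model = pickle.load(f)

fragmentizer = Fragmentizer(model)
# lexicon entries are (orthography, phoneme sequence) pairs
fragmentizer.addSupervised([("hello", ("HH", "AH", "L", "OW"))])
print(fragmentizer("hello"))   # returns the cached supervised split
print(fragmentizer("world"))   # falls back to the model's first-best joint segmentation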
Example #2
def main(options, args):
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=stdout)
        if not model:
            return 1
        if options.testSample or options.applySample:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model

    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)

    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
Example #3
def g2pMain(options, args):
    import locale
    loadSample = loadG2PSample

    enc = locale.getpreferredencoding()
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer,
                                           errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout,
                                           errors='backslashreplace')

    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer,
                                           errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr,
                                           errors='backslashreplace')

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model

    if options.applyWord:
        return g2pApplyWord(translator, options, log_stdout)
Example #4
    def __init__(self, dict_path=__dict_path__, model_path=__model_path__):
        self._dict_ = dict()
        dict_path = os.path.expanduser(dict_path)
        model_path = os.path.expanduser(model_path)
        self.__dict_path__ = dict_path
        self.__model_path__ = model_path

        sequitur_options = Values()
        sequitur_options.resume_from_checkpoint = False
        sequitur_options.modelFile = model_path
        sequitur_options.shouldRampUp = False
        sequitur_options.trainSample = False
        sequitur_options.shouldTranspose = False
        sequitur_options.newModelFile = False
        sequitur_options.shouldSelfTest = False
        self.__model__ = SequiturTool.procureModel(sequitur_options, None)
        if not self.__model__:
            logger.error('Can\'t load g2p model.')
            return None
        self.__model__ = Translator(self.__model__)

        # load the pronunciation dictionary: "word phoneme phoneme ..." per line
        with open(dict_path) as dict_file:
            for line in dict_file:
                entry = line.strip('\n').split(' ')
                self._dict_[entry[0]] = entry[1:]
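
The excerpt only shows the constructor; a plausible lookup helper (hypothetical, not part of the original class) would consult the loaded pronunciation dictionary first and fall back to the Sequitur translator, assuming Translator is imported in the module:

    def lookup(self, word):
        # hypothetical helper: prefer the dictionary entry, otherwise run g2p
        if word in self._dict_:
            return self._dict_[word]
        try:
            return list(self.__model__(word))   # self.__model__ holds a Translator
        except Translator.TranslationFailure:
            return []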
Example #5
class Fragmentizer:
    def __init__(self, model):
	self.model = model
	self.translator = Translator(self.model)
	self.memory = dict()

    def addSupervised(self, lexicon=None):
	"""
	Caveat: supervised splitting might come up with graphones that
	are NOT present in the model g2p, because they were trimmed!
	Therefore this function may modify the sequitur inventory.
	"""
	segmenter = Segmenter(self.model)
	fragments = set()
	for orth, phon in lexicon:
	    logLik, joint = segmenter.firstBestJoint(orth, phon)
	    for fragment in joint:
		fragments.add(fragment)
	    joint = [ lmToken(gra, pho) for gra, pho in joint ]
	    if orth not in self.memory: self.memory[orth] = []
	    self.memory[orth].append(joint)

	oldSize, newSize = self.model.strip()
	print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)

	sequitur = self.model.sequitur
	for gra, pho in fragments:
	    fragment = ( sequitur.leftInventory.parse(gra),
			 sequitur.rightInventory.parse(pho) )
	    sequitur.inventory.index(fragment)
	self.translator.setModel(self.model)

    def __call__(self, word):
	translations = []
	if word in self.memory:
	    translations = self.memory[word]
	else:
	    try:
		logLik, joint = self.translator.firstBestJoint(word)
		joint = [ lmToken(gra, pho) for gra, pho in joint ]
		translations.append(joint)
	    except Translator.TranslationFailure:
		print 'failed to represent "%s" using graphones' % word
		translations.append([word+'[UNKNOWN]'])
	return translations
Example #6
    def procureModel(self):
        if self.options.resume_from_checkpoint:
            model = ModelTemplate.resume(self.options.resume_from_checkpoint)
            self.sequitur = model.sequitur
        elif self.options.modelFile:
            if sys.version_info[:2] >= (3, 0):
                model = pickle.load(open(self.options.modelFile, 'rb'), encoding='latin1')
            else:
                try:
                    model = pickle.load(open(self.options.modelFile, 'rb'))
                except ValueError:
                    print('This error most likely occurred because the loaded model was created in Python 3.\n', file=sys.stderr)
                    raise
                
            self.sequitur = model.sequitur
        else:
            self.sequitur = Sequitur()
            model = None

        if self.options.shouldRampUp:
            model.rampUp()

        if self.options.trainSample:
            model = self.trainModel(model)
            if not model:
                print('failed to estimate or load model', file=self.log)
                return

        if not model:
            raise UsageError

#       model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)

        if self.options.shouldTranspose:
            model.transpose()

        if self.options.newModelFile:
            oldSize, newSize = model.strip()
            print('stripped number of multigrams from %d to %d' % (oldSize, newSize), file=self.log)
            f = open(self.options.newModelFile, 'wb')
            pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            del f

        if self.options.shouldSelfTest:
            print('warning: --self-test does not treat pronunciation variants correctly', file=self.log)
            if not self.develSample:
                print('error: cannot do --self-test without --devel sample', file=self.log)
            else:
                translator = Translator(model)
                evaluator = Evaluator()
                evaluator.setSample(self.develSample)
                evaluator.verboseLog = self.log
                result = evaluator.evaluate(translator)
                print(result, file=self.log)

        return model
Example #7
    def procureModel(self):
        if self.options.resume_from_checkpoint:
            model = ModelTemplate.resume(self.options.resume_from_checkpoint)
            self.sequitur = model.sequitur
        elif self.options.modelFile:
            model = pickle.load(open(self.options.modelFile, "rb"))
            self.sequitur = model.sequitur
        else:
            self.sequitur = Sequitur()
            model = None

        if self.options.shouldRampUp:
            model.rampUp()

        if self.options.trainSample:
            model = self.trainModel(model)
            if not model:
                print('failed to estimate or load model', file=self.log)
                return

        if not model:
            raise UsageError


#       model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)

        if self.options.shouldTranspose:
            model.transpose()

        if self.options.newModelFile:
            oldSize, newSize = model.strip()
            print('stripped number of multigrams from %d to %d' %
                  (oldSize, newSize),
                  file=self.log)
            f = open(self.options.newModelFile, 'wb')
            pickle.dump(model, f)
            f.close()
            del f

        if self.options.shouldSelfTest:
            print(
                'warning: --self-test does not treat pronunciation variants correctly',
                file=self.log)
            if not self.develSample:
                print('error: cannot do --self-test without --devel sample',
                      file=self.log)
            else:
                translator = Translator(model)
                evaluator = Evaluator()
                evaluator.setSample(self.develSample)
                evaluator.verboseLog = self.log
                result = evaluator.evaluate(translator)
                print(result, file=self.log)

        return model
Example #8
    def __init__(self, modelfn=SEQUITUR_MODEL):

        options = SeqOptionsObject()
        options.resume_from_checkpoint = False
        options.modelFile              = modelfn
        options.shouldRampUp           = False
        options.trainSample            = None
        options.shouldTranspose        = False
        options.newModelFile           = None
        options.shouldSelfTest         = False

        self.model = SequiturTool.procureModel(options, loadG2PSample, log=sys.stdout)

        self.translator = Translator(self.model)
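
The class name is not part of this excerpt; assuming it is called G2P, the resulting translator could be used like this (a sketch following the calling pattern of the other examples):

g2p = G2P()                                # loads SEQUITUR_MODEL by default
try:
    phonemes = g2p.translator("sequitur")  # Translator maps a grapheme string
    print(" ".join(phonemes))              # to a sequence of phoneme symbols
except Translator.TranslationFailure:
    print("no pronunciation found")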
Example #9
def main(options, args):
    model = SequiturTool.procureModel(options, loadSample)
    if options.applySample:
        lines = gopen(options.applySample).readlines()
        words = set(word for line in lines for word in line.split())
        addUnknowns(model, words)
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                print(" ".join(result))
            except translator.TranslationFailure:
                print("<translation-failed/>")
Example #10
def main(options, args):
    import locale
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample

    enc = locale.getpreferredencoding()
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer,
                                           errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout,
                                           errors='backslashreplace')

    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer,
                                           errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr,
                                           errors='backslashreplace')

    #the encoding relates to the lexicon, not the standard IO
    #log_stdout = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stdout) if options.encoding else sys.stdout;
    #log_stderr = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stderr) if options.encoding else sys.stderr;

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model

    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options,
                 log_stdout)
        translator.reportStats(log_stdout)

    if options.applySample:
        mainApply(translator, options,
                  gOpenOut('-', options.encoding or defaultEncoding))
        translator.reportStats(log_stderr)

    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
Example #11
 def __init__(self, model_path):
     class options(object):
         pass  
     options = options()
     options.testSample = None
     options.modelFile = model_path  
     options.trainSample = None  
     options.encoding = 'ISO-8859-15'  
     options.shouldInitializeWithCounts = None  
     options.psyco = None  
     options.stack_limit = None  
     options.shouldTranspose = None  
     options.applySample = 'args.txt'  
     options.shouldRampUp = None  
     options.resume_from_checkpoint = None  
     options.lengthConstraints = None  
     options.checkpoint = None  
     options.eager_discount_adjustment = None  
     options.fakeTranslator = None  
     options.tempdir = None  
     options.profile = None  
     options.variants_number = None  
     options.maxIterations = 100  
     options.testResult = None  
     options.variants_mass = None  
     options.shouldSuppressNewMultigrams = None  
     options.develSample = None  
     options.shouldWipeModel = None  
     options.resource_usage = None  
     options.test_segmental = None  
     options.fixed_discount = None  
     options.newModelFile = None  
     options.minIterations = 20  
     options.shouldSelfTest = None  
     options.viterbi = None  
     options.shouldTestContinuously = None  
     options.phoneme_to_phoneme = None
     
     import codecs
     global defaultEncoding
     defaultEncoding = options.encoding
     global stdout, stderr
     encoder, decoder, streamReader, streamWriter = codecs.lookup(options.encoding)
     stdout = streamWriter(sys.stdout)
     stderr = streamWriter(sys.stderr)
     loadSample = loadG2PSample
     model = SequiturTool.procureModel(options, loadSample, log=stdout)
     self.translator = Translator(model)
Example #12
def translate(text):
    text = text.replace(",", " ,")
    text = text.replace(".", " .")
    text = text.replace("?", " ?")
    text = text.replace(":", " .")
    text = text.replace("\"", "")

    translator = Translator(g2p)
    phone = []
    for w in text.split(" "):
        try:
            # punctuation and the explicit "<sp>" token map to a short pause;
            # everything else goes through the g2p translator
            if w in [".", ",", "?", "<sp>"]:
                phone.append("sp")
            else:
                phones = translator(w.lower())
                phone.extend(phones)
            phone.append(" ")
        except Translator.TranslationFailure:
            pass
    return phone
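
A possible invocation, assuming g2p is a Sequitur model object already loaded in the enclosing module:

phones = translate("hello world, how are you?")
# `phones` interleaves phoneme symbols, "sp" pause markers and " " separators
print("".join(phones))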
Example #13
    def __init__(self, model):
	self.model = model
	self.translator = Translator(self.model)
	self.memory = dict()
Example #14
 def __setstate__(self, d):
     self.__dict__ = d
     self.translator = Translator(self.model)
Example #15
    try:
        return m[s]
    except KeyError:
        return s

if __name__ == "__main__":
    chardictfn = sys.argv[1]
    datadir = sys.argv[2]
    altlangtags = sys.argv[3].split(",")

    with codecs.open(chardictfn, encoding="utf-8") as infh:
        chardict = dict([(line.split()[0], line.split()[1:]) for line in infh if line.strip() != ""])
    translators = {}
    phonemaps = {}
    with open(os.path.join(datadir, "g2p.model.pickle")) as infh:
        translators[""] = Translator(pickle.load(infh))
    for altlangtag in altlangtags:
        with open(os.path.join(datadir, "g2p.model."+altlangtag+".pickle")) as infh:
            translators[altlangtag] = Translator(pickle.load(infh))
        with open(os.path.join(datadir, "g2p.phonemap."+altlangtag+".tsv")) as infh:
            fields = [line.strip().split("\t") for line in infh if line.strip()]
            phonemaps[altlangtag] = dict(fields)
                
    for line in sys.stdin:
        line = unicode(line, encoding="utf-8").strip()
        word = line.split("<")[0]
        try:
            pronun = chardict[word]
        except KeyError:
            try:
                pronun = None
Example #16
try:
    import cPickle as pickle
except ImportError:
    import pickle
    
from sequitur import Translator

UNK_WORD = "<unk>" #DEMIT: centralize this at some stage

if __name__ == "__main__":
    chardictfn = sys.argv[1]
    g2pmodelfn = sys.argv[2]

    with codecs.open(chardictfn, encoding="utf-8") as infh:
        chardict = dict([(line.split()[0], line.split()[1:]) for line in infh if line.strip() != ""])
    with open(g2pmodelfn) as infh:
        g2pmodel = pickle.load(infh)
    translator = Translator(g2pmodel)
    
    for line in sys.stdin:
        line = unicode(line, encoding="utf-8").strip()
        word = line.split("<")[0]
        try:
            pronun = chardict[word]
        except KeyError:
            try:
                pronun = translator(word)
                if not pronun:
                    pronun = chardict[UNK_WORD]
            except BaseException as e:
                print("FAILED WORD:", word.encode("utf-8"), file=sys.stderr)
                pronun = chardict[UNK_WORD]
                      
Example #17
    def procureModel(self):
        #print self.options,type(self.options)
        #print self.loadSample,type(self.loadSample)
        #print self.log,type(self.log)

        if self.options.resume_from_checkpoint:
            model = ModelTemplate.resume(self.options.resume_from_checkpoint)
            self.sequitur = model.sequitur
        elif self.options.modelFile:
            #print "loading",self.options.modelFile
            f = open(self.options.modelFile, 'rb')

            #print "loaded",f
            #print "type:",type(f)
            #print pickle
            class Model(object):
                pass

            model = pickle.load(f)
            #print "loaded",self.options.modelFile
            self.sequitur = model.sequitur
        else:
            self.sequitur = Sequitur()
            model = None

        if self.options.shouldRampUp:
            model.rampUp()

        if self.options.trainSample:
            model = self.trainModel(model)
            if not model:
                print >> self.log, 'failed to estimate or load model'
                return

        if not model:
            raise UsageError


#       model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)

        if self.options.shouldTranspose:
            model.transpose()

        if self.options.newModelFile:
            oldSize, newSize = model.strip()
            print >> self.log, 'stripped number of multigrams from %d to %d' % (
                oldSize, newSize)
            f = open(self.options.newModelFile, 'wb')
            pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            del f

        if self.options.shouldSelfTest:
            print >> self.log, 'warning: --self-test does not treat pronunciation variants correctly'
            if not self.develSample:
                print >> self.log, 'error: cannot do --self-test without --devel sample'
            else:
                translator = Translator(model)
                evaluator = Evaluator()
                evaluator.setSample(self.develSample)
                evaluator.verboseLog = self.log
                result = evaluator.evaluate(translator)
                print >> self.log, result

        return model
Example #18
 def __call__(self, log, context, model):
     translator = Translator(model)
     result = self.evaluator.evaluate(translator)
     print('ER %s: string errors %s    symbol errors %s' %
           (self.name, result.stringError, result.symbolError),
           file=log)
Example #19
 def __init__(self, jsmmodel, graphtranstable):
     self.gmap = graphtranstable
     self.model = jsmmodel
     self.translator = Translator(self.model)