def __init__(self, language="EN", minWordLength=4, quality=8, hyphenDir=None):
    """Initialise the hyphenator and load the pattern file for *language*.

    The file ``hyph_<language>.dic`` is read from *hyphenDir*.  Its first
    line is stored as ``self.characters``; every following line is one
    pattern.  No trie is built: the patterns are kept in a plain dict that
    maps the pattern's non-digit characters to a string of digit codes
    (one code before each character plus one trailing code, ``"0"`` where
    the pattern gives no digit).

    :param language: language code used to build the pattern file name.
    :param minWordLength: passed through to ``BaseHyphenator.__init__``.
    :param quality: stored unchanged as ``self.quality``.
    :param hyphenDir: directory holding the pattern files; defaults to the
        ``dict`` directory next to this module.
    :raises IOError: (``OSError`` on Python 3) if the file cannot be opened.
    :raises IndexError: if the pattern file contains no lines at all.
    """
    BaseHyphenator.__init__(self, language=language, minWordLength=minWordLength)
    if hyphenDir is None:
        hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
    fname = os.path.join(hyphenDir, "hyph_%s.dic" % language)
    self.quality = quality

    # Context manager: the handle is closed even if reading fails, instead
    # of being left to the garbage collector.
    with open(fname) as pattern_file:
        lines = pattern_file.read().splitlines()

    # First line is the set of characters, all other lines are patterns.
    self.characters = lines.pop(0)
    self.patterns = {}
    for pattern in lines:
        letters = []
        codes = []
        digit = "0"  # code that precedes the next character
        for ch in pattern:
            if "0" <= ch <= "9":
                digit = ch
            else:
                codes.append(digit)
                letters.append(ch)
                digit = "0"
        codes.append(digit)  # code after the last character
        pat = "".join(letters)
        # Byte strings (what Python 2 reads from the file) are decoded as
        # ISO-8859-1 exactly as before; text that is already unicode has no
        # ``decode`` method and is stored as it is.
        if isinstance(pat, bytes):
            pat = pat.decode("iso-8859-1")
        self.patterns[pat] = "".join(codes)
def hyphenate(self, aWord):
    """Hyphenate *aWord* and return the result as a ``HyphenatedWord``.

    The word is decomposed with ``self.zerlegeWort``; what happens next
    depends on how many decompositions come back:

    * more than one (ambiguous): a point of the first decomposition is
      kept only if a point with the same ``indx``, ``nl``, ``sl``, ``nr``
      and ``sr`` occurs in every other decomposition; it gets the lowest
      quality found among them.
    * exactly one: its points are used unchanged.
    * none (unknown word): the word is cut at the first inner character
      that is in ``self.postfixChars`` and not followed by a digit.  Both
      parts are hyphenated recursively and merged.  Without such a
      character the result of ``BaseHyphenator.hyphenate`` is used.

    :param aWord: the word to hyphenate; must be a ``unicode_type``.
    :returns: a ``HyphenatedWord`` whose ``hyphenations`` list is filled in.
    """
    assert isinstance(aWord, unicode_type)
    result = HyphenatedWord(aWord)
    candidates = self.zerlegeWort(aWord)

    if len(candidates) > 1:
        # Ambiguous: intersect the first decomposition with all the others,
        # taking the worst quality for every surviving point.
        points = []
        reference = candidates[0]
        others = candidates[1:]
        for point in reference:
            quality = point.quality
            for other in others:
                if not quality:
                    continue
                for match in other:
                    same_point = (
                        match.indx == point.indx
                        and match.nl == point.nl
                        and match.sl == point.sl
                        and match.nr == point.nr
                        and match.sr == point.sr
                    )
                    if same_point:
                        quality = min(quality, match.quality)
                        break
                else:
                    # This decomposition does not contain the point.
                    quality = 0
            if quality:
                points.append(
                    HyphenationPoint(point.indx, quality, point.nl,
                                     point.sl, point.nr, point.sr)
                )
    elif len(candidates) == 1:
        # Unique decomposition (its point list may still be empty).
        points = candidates[0]
    else:
        # Unknown word: try to cut it at a postfix character.
        points = []
        for pos in range(1, len(aWord) - 1):
            ch = aWord[pos]
            if ch not in self.postfixChars or aWord[pos + 1] in "0123456789":
                continue
            # Cut into two parts and look at each one separately.
            if ch in [self.shy, u"-"]:
                left_insert = u""
            else:
                left_insert = self.shy
            head = self.hyphenate(aWord[:pos])
            head.hyphenations.append(
                HyphenationPoint(pos + 1, 9, 0, left_insert, 0, u"")
            )
            tail = self.hyphenate(aWord[pos + 1:])
            if tail.hyphenations == []:
                # Only the first part can be hyphenated.
                points = head.hyphenations
            else:
                # Both parts can be hyphenated: shift the indices of the
                # second part so they refer to the whole word.
                shifted = [
                    HyphenationPoint(p.indx + pos + 1, p.quality,
                                     p.nl, p.sl, p.nr, p.sr)
                    for p in tail.hyphenations
                ]
                points = head.hyphenations + shifted
            break
        else:
            # No usable postfix character: fall back to the base class.
            points = BaseHyphenator.hyphenate(self, aWord).hyphenations

    result.hyphenations = points
    return result
def i_hyphenate_derived(self, aWord):
    """
    You can use this method in classes derived from ExplicitHyphenator.
    It will first split the word using BaseHyphenator,
    then for each "subword" it will call ExplicitHyphenator,
    and only call the derived classes hyph method for the still
    unknown subwords.

    TODO: The implementation does not match the docstring
          test: "hohenlimburg.de", "hohenlimburg.de)"

    :param aWord: the word to hyphenate; must be a ``unicode_type``.
    :returns: the single sub-result if the base hyphenator produced no
        split points, otherwise ``HyphenatedWord.join`` of all sub-results.
    """
    # ``unicode_type`` is the text-type alias used elsewhere in this file;
    # the bare ``unicode`` builtin does not exist on Python 3.
    assert isinstance(aWord, unicode_type)

    def hyphenate_subword(subword):
        """Hyphenate one piece, trying the available strategies in order."""
        if SHY in subword:
            sub_hword = self.stripper.apply_stripped(BaseHyphenator.hyph, self, subword)
        else:
            sub_hword = self.stripper.apply_stripped(ExplicitHyphenator.hyph, self, subword)
        if sub_hword is None:
            # Still unknown: ask the (possibly derived) class itself.
            sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
        if sub_hword is None:
            # Nobody knows the piece: keep it without hyphenation points.
            sub_hword = HyphenatedWord(subword, hyphenations=[])
        return sub_hword

    hword = BaseHyphenator.i_hyphenate(self, aWord)
    if hword is None:
        hword = HyphenatedWord(aWord, hyphenations=[])

    sub_hwords = []
    last_indx = 0
    nr = 0
    for hp in hword.hyphenations:
        if isinstance(hp, int):
            # Plain integer positions are promoted to full points.
            hp = HyphenationPoint(hp, quality=5, sl=SHY)
        sub_hwords.append(hyphenate_subword(hword[last_indx + nr:hp.indx]))
        last_indx = hp.indx
        nr = hp.nr

    # Now the last subword.
    # NOTE(review): unlike the slices in the loop, this one starts at
    # last_indx rather than last_indx + nr -- confirm this is intended.
    sub_hwords.append(hyphenate_subword(hword[last_indx:]))

    if len(sub_hwords) > 1:
        return HyphenatedWord.join(sub_hwords)
    # NOTE(review): an older comment said this may be None, but every entry
    # collected above has None replaced by an empty HyphenatedWord.
    return sub_hwords[0]
def hyphenate(self, aWord):
    """Return a ``HyphenatedWord`` carrying the hyphenation points of *aWord*.

    ``self.zerlegeWort`` supplies the possible decompositions of the word:

    * several decompositions: only points of the first one that are also
      present (equal ``indx``, ``nl``, ``sl``, ``nr``, ``sr``) in all the
      others are kept, each with the minimum quality found.
    * one decomposition: it is taken as it is.
    * no decomposition: the word is cut at the first inner character that
      belongs to ``self.postfixChars`` and is not followed by a digit; the
      two parts are hyphenated recursively and combined.  If there is no
      such character, ``BaseHyphenator.hyphenate`` provides the points.

    :param aWord: the word to hyphenate; must be a ``unicode_type``.
    :returns: a ``HyphenatedWord`` with its ``hyphenations`` attribute set.
    """
    # ``unicode_type`` is the text-type alias used elsewhere in this file;
    # the bare ``unicode`` builtin does not exist on Python 3.
    assert isinstance(aWord, unicode_type)
    hword = HyphenatedWord(aWord)
    loesungen = self.zerlegeWort(aWord)
    if len(loesungen) > 1:
        # Ambiguous: take only those points that occur in all solutions,
        # and use the worst quality for each of them.
        loesung = []
        loesung0, andere = loesungen[0], loesungen[1:]
        for hp in loesung0:
            q = hp.quality
            for a in andere:
                if not q:
                    # Point already rejected; the remaining solutions
                    # cannot bring it back.
                    break
                for hp1 in a:
                    if (hp1.indx == hp.indx
                            and hp1.nl == hp.nl and hp1.sl == hp.sl
                            and hp1.nr == hp.nr and hp1.sr == hp.sr):
                        q = min(q, hp1.quality)
                        break
                else:
                    # Point is not contained in the other solution.
                    q = 0
            if q:
                loesung.append(
                    HyphenationPoint(hp.indx, q, hp.nl, hp.sl, hp.nr, hp.sr)
                )
    elif len(loesungen) == 1:
        # Unique solution; an empty list means "not hyphenatable".
        loesung = loesungen[0]
    else:
        # Unknown word.
        loesung = []
        for i in range(1, len(aWord) - 1):
            if aWord[i] in self.postfixChars and aWord[i + 1] not in "0123456789":
                # Split into two parts and treat them separately.
                r = self.shy
                if aWord[i] in [self.shy, u"-"]:
                    r = u""
                loesung1 = self.hyphenate(aWord[:i])
                loesung1.hyphenations.append(
                    HyphenationPoint(i + 1, 9, 0, r, 0, u"")
                )
                loesung2 = self.hyphenate(aWord[i + 1:])
                # TODO: these solutions still have to be merged properly.
                if loesung2.hyphenations == []:
                    # Only the first part can be hyphenated.
                    loesung = loesung1.hyphenations
                else:
                    # Both parts can be hyphenated; shift the indices of
                    # the second part so they refer to the whole word.
                    loesung = loesung1.hyphenations + [
                        HyphenationPoint(hp.indx + i + 1, hp.quality,
                                         hp.nl, hp.sl, hp.nr, hp.sr)
                        for hp in loesung2.hyphenations
                    ]
                break
        else:
            # No split character found: fall back to the base class.
            loesung = BaseHyphenator.hyphenate(self, aWord).hyphenations
    hword.hyphenations = loesung
    return hword
def i_hyphenate_derived(self, aWord):
    """
    Helper for classes derived from ExplicitHyphenator.

    The word is first split with ``BaseHyphenator.i_hyphenate``.  Every
    resulting piece is then hyphenated on its own: pieces containing
    ``SHY`` go to ``BaseHyphenator.hyph``, all others to
    ``ExplicitHyphenator.hyph``; a piece that is still unknown is handed to
    ``self.hyph``, and if that fails too it is kept without any
    hyphenation points.

    TODO: The implementation does not match the docstring
          test: "hohenlimburg.de", "hohenlimburg.de)"

    :param aWord: the word to hyphenate; must be a ``unicode_type``.
    :returns: the only piece if there is just one, otherwise the result of
        ``HyphenatedWord.join`` over all pieces.
    """
    assert isinstance(aWord, unicode_type)

    base_word = BaseHyphenator.i_hyphenate(self, aWord)
    if base_word is None:
        base_word = HyphenatedWord(aWord, hyphenations=[])

    def segments():
        # Lazily yield the slices between consecutive base hyphenation
        # points, followed by the remainder after the last point.
        start = 0
        skip = 0
        for point in base_word.hyphenations:
            if isinstance(point, int):
                point = HyphenationPoint(point, quality=5, sl=SHY)
            yield base_word[start + skip:point.indx]
            start = point.indx
            skip = point.nr
        # The final slice starts at the last index itself (no skip added).
        yield base_word[start:]

    pieces = []
    for part in segments():
        first_choice = BaseHyphenator.hyph if SHY in part else ExplicitHyphenator.hyph
        piece = self.stripper.apply_stripped(first_choice, self, part)
        if piece is None:
            piece = self.stripper.apply_stripped(self.hyph, self, part)
        if piece is None:
            piece = HyphenatedWord(part, hyphenations=[])
        pieces.append(piece)

    if len(pieces) > 1:
        return HyphenatedWord.join(pieces)
    return pieces[0]
def __init__ (self, language="DE", minWordLength=4, qHaupt=8, qNeben=5, qVorsilbe=5, qSchlecht=3, hyphenDir=None, **options ): BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength,**options) # Qualitäten für verschiedene Trennstellen self.qHaupt=qHaupt self.qNeben=qNeben self.qVorsilbe=qVorsilbe self.qSchlecht=qSchlecht # Stammdaten initialisieren self.sonderfaelle = []