Ejemplo n.º 1
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None):
     """Initialise the hyphenator and load its pattern file.

     The pattern file is ``hyph_<language>.dic`` inside ``hyphenDir``.
     Its first line is the set of characters; every following line is
     one pattern. No trie is built: the patterns are kept in the plain
     dict ``self.patterns``, mapping the pattern text (digits removed,
     decoded from ISO-8859-1) to a string of digit codes.

     The code string has one digit per pattern character (the digit
     written directly in front of that character, or "0" if there is
     none) plus one trailing digit for the position after the last
     character. For example, the pattern ``a1b`` is stored as
     ``patterns[u"ab"] = "010"``.

     :param language: language code; selects the pattern file and is
         passed on to ``BaseHyphenator.__init__``.
     :param minWordLength: passed on to ``BaseHyphenator.__init__``.
     :param quality: stored unchanged in ``self.quality``.
     :param hyphenDir: directory holding the pattern files; defaults to
         the ``dict`` directory next to this module.
     """
     BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
     # load pattern file
     fname = os.path.join(hyphenDir,"hyph_%s.dic"%language)
     self.quality = quality
     # Use a context manager so the file handle is closed deterministically
     # instead of being left to the garbage collector.
     with open(fname) as pattern_file:
         lines = pattern_file.read().splitlines()
     # first line is set of characters, all other lines are patterns
     self.characters = lines.pop(0)
     self.patterns = {}
     for pattern in lines:
         letters = []
         digits = []
         pending = "0"   # digit seen since the previous letter ("0" if none)
         for ch in pattern:
             if '0' <= ch <= '9':
                 pending = ch
             else:
                 digits.append(pending)
                 letters.append(ch)
                 pending = "0"
         # trailing code for the position after the last letter
         digits.append(pending)
         self.patterns["".join(letters).decode("iso-8859-1")] = "".join(digits)
Ejemplo n.º 2
0
 def __init__(self,
              language="EN",
              minWordLength=4,
              quality=8,
              hyphenDir=None):
     """Initialise the hyphenator and load its pattern file.

     The pattern file is ``hyph_<language>.dic`` inside ``hyphenDir``.
     Its first line is the set of characters; every following line is
     one pattern. No trie is built: the patterns are kept in the plain
     dict ``self.patterns``, mapping the pattern text (digits removed,
     decoded from ISO-8859-1) to a string of digit codes.

     The code string has one digit per pattern character (the digit
     written directly in front of that character, or "0" if there is
     none) plus one trailing digit for the position after the last
     character. For example, the pattern ``a1b`` is stored as
     ``patterns[u"ab"] = "010"``.

     :param language: language code; selects the pattern file and is
         passed on to ``BaseHyphenator.__init__``.
     :param minWordLength: passed on to ``BaseHyphenator.__init__``.
     :param quality: stored unchanged in ``self.quality``.
     :param hyphenDir: directory holding the pattern files; defaults to
         the ``dict`` directory next to this module.
     """
     BaseHyphenator.__init__(self,language=language,minWordLength=minWordLength)
     if hyphenDir is None:
         hyphenDir = os.path.join(os.path.split(__file__)[0], "dict")
     # load pattern file
     fname = os.path.join(hyphenDir,"hyph_%s.dic"%language)
     self.quality = quality
     # Use a context manager so the file handle is closed deterministically
     # instead of being left to the garbage collector.
     with open(fname) as pattern_file:
         lines = pattern_file.read().splitlines()
     # first line is set of characters, all other lines are patterns
     self.characters = lines.pop(0)
     self.patterns = {}
     for pattern in lines:
         letters = []
         digits = []
         pending = "0"   # digit seen since the previous letter ("0" if none)
         for ch in pattern:
             if '0' <= ch <= '9':
                 pending = ch
             else:
                 digits.append(pending)
                 letters.append(ch)
                 pending = "0"
         # trailing code for the position after the last letter
         digits.append(pending)
         self.patterns["".join(letters).decode("iso-8859-1")] = "".join(digits)
Ejemplo n.º 3
0
    def hyphenate(self,aWord):
        """Return a HyphenatedWord for ``aWord`` with its hyphenation points.

        ``self.zerlegeWort`` supplies zero or more candidate solutions,
        each a sequence of hyphenation points:

        * several solutions: only points of the first solution that occur
          identically in every other solution are kept, each with the
          lowest quality found for it;
        * exactly one solution: it is used as is;
        * no solution: the word is split at the first character (not the
          first or last one) that is in ``self.postfixChars`` and is not
          followed by a digit; both halves are hyphenated recursively and
          merged. If there is no such character, the result of
          ``BaseHyphenator.hyphenate`` is used.
        """
        assert isinstance(aWord, unicode_type)
        hword = HyphenatedWord(aWord)
        candidates = self.zerlegeWort(aWord)
        n_candidates = len(candidates)

        if n_candidates > 1:
            # Ambiguous: keep only the points shared by all solutions and
            # rate each of them with the worst quality seen.
            reference = candidates[0]
            others = candidates[1:]
            points = []
            for ref in reference:
                worst = ref.quality
                for other in others:
                    if not worst:
                        # already rejected, nothing left to compare
                        continue
                    for cand in other:
                        same_point = (cand.indx==ref.indx
                                      and cand.nl==ref.nl and cand.sl==ref.sl
                                      and cand.nr==ref.nr and cand.sr==ref.sr)
                        if same_point:
                            worst = min(worst,cand.quality)
                            break
                    else:
                        # point is missing from this other solution
                        worst = 0
                if worst:
                    points.append(HyphenationPoint(ref.indx,worst,ref.nl,ref.sl,ref.nr,ref.sr))
        elif n_candidates == 1:
            # Unambiguous solution (it may contain no points at all).
            points = candidates[0]
        else:
            # Unknown word: look for a character to split at.
            points = []
            for pos in range(1,len(aWord)-1):
                ch = aWord[pos]
                if ch not in self.postfixChars or aWord[pos+1] in "0123456789":
                    continue
                # Split into two parts and hyphenate each separately.
                # A soft hyphen or "-" is already a visible break, so no
                # extra character is inserted on the left side.
                glue = u"" if ch in [self.shy,u"-"] else self.shy
                left = self.hyphenate(aWord[:pos])
                left.hyphenations.append(HyphenationPoint(pos+1,9,0,glue,0,u""))
                right = self.hyphenate(aWord[pos+1:])
                # TODO: these solutions still need to be merged properly.
                if right.hyphenations == []:
                    # only the first part can be hyphenated
                    points = left.hyphenations
                else:
                    # both parts can be hyphenated; shift the right-hand
                    # points by the length of the left part plus the
                    # split character
                    shifted = [
                        HyphenationPoint(p.indx+pos+1,p.quality,p.nl,p.sl,p.nr,p.sr)
                        for p in right.hyphenations
                    ]
                    points = left.hyphenations + shifted
                break
            else:
                points = BaseHyphenator.hyphenate(self,aWord).hyphenations

        hword.hyphenations = points
        return hword
Ejemplo n.º 4
0
    def i_hyphenate_derived(self,aWord):
        """
        You can use this method in classes derived from ExplicitHyphenator.
        It will first split the word using BaseHyphenator,
        then for each "subword" it will call ExplicitHyphenator,
        and only call the derived classes hyph method for the still
        unknown subwords.

        Returns the joined result (``HyphenatedWord.join``) when the word
        was split into several subwords, otherwise the single subword
        result.

        TODO: The implementation does not match the docstring
              test: "hohenlimburg.de", "hohenlimburg.de)"

        NOTE(review): the trailing subword starts at ``last_indx`` without
        skipping ``nr`` characters, unlike the subwords cut inside the
        loop - confirm that this is intended.
        """
        assert isinstance(aWord, unicode)

        def hyphenate_subword(subword):
            # Hyphenate one piece of the word; never returns None.
            # Pieces containing a soft hyphen are handled by BaseHyphenator,
            # all others are first looked up via ExplicitHyphenator.
            if SHY in subword:
                first_choice = BaseHyphenator.hyph
            else:
                first_choice = ExplicitHyphenator.hyph
            sub_hword = self.stripper.apply_stripped(first_choice, self, subword)
            if sub_hword is None:
                # still unknown: fall back to the (derived) class's own hyph
                sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
            if sub_hword is None:
                # nobody knows the piece: keep it without hyphenation points
                sub_hword = HyphenatedWord(subword, hyphenations=[])
            return sub_hword

        hword = BaseHyphenator.i_hyphenate(self,aWord)
        if hword is None:
            hword = HyphenatedWord(aWord,hyphenations=[])

        sub_hwords = []
        last_indx = 0
        nr = 0
        for hp in hword.hyphenations:
            if isinstance(hp, int):
                # a bare index stands for a plain soft-hyphen point
                hp = HyphenationPoint(hp, quality=5, sl=SHY)
            sub_hwords.append(hyphenate_subword(hword[last_indx+nr:hp.indx]))
            last_indx = hp.indx
            nr = hp.nr
        # Now the last subword
        sub_hwords.append(hyphenate_subword(hword[last_indx:]))

        if len(sub_hwords) > 1:
            return HyphenatedWord.join(sub_hwords)
        return sub_hwords[0]
Ejemplo n.º 5
0
    def hyphenate(self,aWord):
        """Return a HyphenatedWord for ``aWord`` with its hyphenation points.

        ``self.zerlegeWort`` supplies zero or more candidate solutions,
        each a sequence of hyphenation points:

        * several solutions: only points of the first solution that occur
          identically in every other solution are kept, each with the
          lowest quality found for it;
        * exactly one solution: it is used as is;
        * no solution: the word is split at the first character (not the
          first or last one) that is in ``self.postfixChars`` and is not
          followed by a digit; both halves are hyphenated recursively and
          merged. If there is no such character, the result of
          ``BaseHyphenator.hyphenate`` is used.
        """
        assert isinstance(aWord, unicode)
        hword = HyphenatedWord(aWord)
        candidates = self.zerlegeWort(aWord)
        n_candidates = len(candidates)

        if n_candidates > 1:
            # Ambiguous: keep only the points shared by all solutions and
            # rate each of them with the worst quality seen.
            reference = candidates[0]
            others = candidates[1:]
            points = []
            for ref in reference:
                worst = ref.quality
                for other in others:
                    if not worst:
                        # already rejected, nothing left to compare
                        continue
                    for cand in other:
                        same_point = (cand.indx==ref.indx
                                      and cand.nl==ref.nl and cand.sl==ref.sl
                                      and cand.nr==ref.nr and cand.sr==ref.sr)
                        if same_point:
                            worst = min(worst,cand.quality)
                            break
                    else:
                        # point is missing from this other solution
                        worst = 0
                if worst:
                    points.append(HyphenationPoint(ref.indx,worst,ref.nl,ref.sl,ref.nr,ref.sr))
        elif n_candidates == 1:
            # Unambiguous solution (it may contain no points at all).
            points = candidates[0]
        else:
            # Unknown word: look for a character to split at.
            points = []
            for pos in range(1,len(aWord)-1):
                ch = aWord[pos]
                if ch not in self.postfixChars or aWord[pos+1] in "0123456789":
                    continue
                # Split into two parts and hyphenate each separately.
                # A soft hyphen or "-" is already a visible break, so no
                # extra character is inserted on the left side.
                glue = u"" if ch in [self.shy,u"-"] else self.shy
                left = self.hyphenate(aWord[:pos])
                left.hyphenations.append(HyphenationPoint(pos+1,9,0,glue,0,u""))
                right = self.hyphenate(aWord[pos+1:])
                # TODO: these solutions still need to be merged properly.
                if right.hyphenations == []:
                    # only the first part can be hyphenated
                    points = left.hyphenations
                else:
                    # both parts can be hyphenated; shift the right-hand
                    # points by the length of the left part plus the
                    # split character
                    shifted = [
                        HyphenationPoint(p.indx+pos+1,p.quality,p.nl,p.sl,p.nr,p.sr)
                        for p in right.hyphenations
                    ]
                    points = left.hyphenations + shifted
                break
            else:
                points = BaseHyphenator.hyphenate(self,aWord).hyphenations

        hword.hyphenations = points
        return hword
Ejemplo n.º 6
0
    def i_hyphenate_derived(self,aWord):
        """
        You can use this method in classes derived from ExplicitHyphenator.
        It will first split the word using BaseHyphenator,
        then for each "subword" it will call ExplicitHyphenator,
        and only call the derived classes hyph method for the still
        unknown subwords.

        Returns the joined result (``HyphenatedWord.join``) when the word
        was split into several subwords, otherwise the single subword
        result.

        TODO: The implementation does not match the docstring
              test: "hohenlimburg.de", "hohenlimburg.de)"

        NOTE(review): the trailing subword starts at ``last_indx`` without
        skipping ``nr`` characters, unlike the subwords cut inside the
        loop - confirm that this is intended.
        """
        assert isinstance(aWord, unicode_type)

        def hyphenate_subword(subword):
            # Hyphenate one piece of the word; never returns None.
            # Pieces containing a soft hyphen are handled by BaseHyphenator,
            # all others are first looked up via ExplicitHyphenator.
            if SHY in subword:
                first_choice = BaseHyphenator.hyph
            else:
                first_choice = ExplicitHyphenator.hyph
            sub_hword = self.stripper.apply_stripped(first_choice, self, subword)
            if sub_hword is None:
                # still unknown: fall back to the (derived) class's own hyph
                sub_hword = self.stripper.apply_stripped(self.hyph, self, subword)
            if sub_hword is None:
                # nobody knows the piece: keep it without hyphenation points
                sub_hword = HyphenatedWord(subword, hyphenations=[])
            return sub_hword

        hword = BaseHyphenator.i_hyphenate(self,aWord)
        if hword is None:
            hword = HyphenatedWord(aWord,hyphenations=[])

        sub_hwords = []
        last_indx = 0
        nr = 0
        for hp in hword.hyphenations:
            if isinstance(hp, int):
                # a bare index stands for a plain soft-hyphen point
                hp = HyphenationPoint(hp, quality=5, sl=SHY)
            sub_hwords.append(hyphenate_subword(hword[last_indx+nr:hp.indx]))
            last_indx = hp.indx
            nr = hp.nr
        # Now the last subword
        sub_hwords.append(hyphenate_subword(hword[last_indx:]))

        if len(sub_hwords) > 1:
            return HyphenatedWord.join(sub_hwords)
        return sub_hwords[0]
Ejemplo n.º 7
0
    def __init__(self, language="DE", minWordLength=4,
                 qHaupt=8, qNeben=5, qVorsilbe=5, qSchlecht=3,
                 hyphenDir=None, **options):
        """Set up the hyphenator and store its quality settings.

        ``language``, ``minWordLength`` and any extra ``options`` are
        passed on to ``BaseHyphenator.__init__``. The four ``q...``
        values are stored as attributes of the same name; they are the
        qualities used for the different kinds of hyphenation points
        (going by the names: main, secondary, prefix and poor -
        presumably; verify against their use elsewhere in the class).
        ``hyphenDir`` is accepted but not used in the visible part of
        this method.
        """
        BaseHyphenator.__init__(
            self,
            language=language,
            minWordLength=minWordLength,
            **options
        )

        # Qualities for the different kinds of hyphenation points
        self.qHaupt, self.qNeben = qHaupt, qNeben
        self.qVorsilbe, self.qSchlecht = qVorsilbe, qSchlecht

        # Initialise the base data
        self.sonderfaelle = list()
Ejemplo n.º 8
0
    def __init__(self, language="DE", minWordLength=4,
                 qHaupt=8, qNeben=5, qVorsilbe=5, qSchlecht=3,
                 hyphenDir=None, **options):
        """Set up the hyphenator and store its quality settings.

        ``language``, ``minWordLength`` and any extra ``options`` are
        passed on to ``BaseHyphenator.__init__``. The four ``q...``
        values are stored as attributes of the same name; they are the
        qualities used for the different kinds of hyphenation points
        (going by the names: main, secondary, prefix and poor -
        presumably; verify against their use elsewhere in the class).
        ``hyphenDir`` is accepted but not used in the visible part of
        this method.
        """
        BaseHyphenator.__init__(
            self,
            language=language,
            minWordLength=minWordLength,
            **options
        )

        # Qualities for the different kinds of hyphenation points
        self.qHaupt, self.qNeben = qHaupt, qNeben
        self.qVorsilbe, self.qSchlecht = qVorsilbe, qSchlecht

        # Initialise the base data
        self.sonderfaelle = list()