Exemple #1
0
 def handleWord(self, word):
     """Record a single word node on the current example (``self.ex``).

     The direct children of *word* are scanned by their "type" attribute:
     the first "txt" child supplies the first word text, every later "txt"
     child overwrites the second word text, and a "punct" child ends the
     scan.

     If non-empty punctuation text was found, it is attached with
     ``addPunctuation`` when ``self.ex.wordList`` is non-empty, or else
     appended as a word using the punctuation for both texts; nothing
     further is done in that case.  Otherwise the morpheme data is stored
     (through ``handleWordMorphemes`` when "morph" elements exist, else
     through ``singleMorphemeWord``) and the word itself is appended.
     """
     first_text = ""
     second_text = ""
     seen_text = False
     punctuation = None
     for node in word.childNodes:
         if not node.attributes:
             # Nodes with missing or empty attributes cannot carry a type.
             continue
         node_type = node.getAttribute("type")
         if node_type == "punct":
             punctuation = xmlutil.getElemText(node)
             break
         if node_type != "txt":
             continue
         content = xmlutil.getElemText(node)
         if seen_text:
             second_text = content
         else:
             first_text = content
             seen_text = True

     if punctuation:
         if not self.ex.wordList:
             # No word to attach to yet, so the punctuation stands alone.
             self.ex.appendWord(punctuation, punctuation)
         else:
             self.ex.addPunctuation(punctuation)
         return

     morph_nodes = word.getElementsByTagName("morph")
     if len(morph_nodes) == 0:
         self.ex.appendMorphObj(singleMorphemeWord(word))
     else:
         self.handleWordMorphemes(morph_nodes)
     self.ex.appendWord(first_text, second_text)
Exemple #2
0
def singleMorphemeWord(word):
    """Build a morpheme object from the word-level items of *word*.

    Used for words made up of a single morpheme, where the attributes are
    read from the word rather than from a morpheme element.

    Every "item" element under *word* is inspected.  An "msa" item sets
    the part of speech.  A "gls" item sets the gloss; if a gloss was
    already stored and ``text1`` is still empty, the earlier gloss is
    moved into ``text1`` before being replaced.

    Returns the populated ``lingex_structs.LingGramMorph``.
    """
    result = lingex_structs.LingGramMorph()
    for node in word.getElementsByTagName("item"):
        if node.attributes is None:
            continue
        kind = node.getAttribute("type")
        if kind == "msa":
            result.pos = xmlutil.getElemText(node)
            continue
        if kind != "gls":
            continue
        earlier_gloss = result.gloss
        if earlier_gloss and not result.text1:
            # Keep the previous gloss as the text before overwriting it.
            result.text1 = earlier_gloss
        result.gloss = xmlutil.getElemText(node)
    return result
Exemple #3
0
 def handleSentence(self, sentence):
     """Load one sentence node into the current example (``self.ex``).

     When ``self.use_segnum`` is set, the stripped text of the first
     direct child typed "segnum" becomes the reference text.  The text of
     each direct child typed "gls" is stored as the free translation, so
     the last one wins.  Every "word" element is passed to
     ``handleWord``.  Finally, a non-empty ``self.prefix`` is prepended to
     the reference text.
     """
     logger.debug(util.funcName('begin'))
     if self.use_segnum:
         segnum_node = next(
             (node for node in sentence.childNodes
              if node.attributes
              and node.getAttribute("type") == "segnum"),
             None)
         if segnum_node is not None:
             self.ex.refText = xmlutil.getElemText(segnum_node).strip()

     word_nodes = sentence.getElementsByTagName("word")
     for node in sentence.childNodes:
         if node.attributes is None:
             continue
         if node.getAttribute("type") != "gls":
             continue
         self.ex.freeTrans = xmlutil.getElemText(node)

     for word_node in word_nodes:
         self.handleWord(word_node)

     if self.prefix:
         self.ex.refText = self.prefix + self.ex.refText
Exemple #4
0
    def handleWordMorphemes(self, morphemes):
        """Convert the morpheme nodes of a word into morph objects.

        For each node in *morphemes* a ``lingex_structs.LingGramMorph`` is
        filled from its "item" elements: the first "txt" item sets
        ``text1``, later "txt" items overwrite ``text2``, "gls" sets the
        gloss and "msa" sets the part of speech.  "cf" items (lex entry,
        typically same as morph text) and any other types are ignored.

        With ``self.config.separateMorphColumns`` set, each morph is
        appended to ``self.ex`` on its own.  Otherwise the morphs are
        collected in a ``MergedMorphemes`` and one combined morph is
        appended, built according to
        ``self.config.get_showMorphemeBreaks()``.
        """
        merged = MergedMorphemes()
        for morph_node in morphemes:
            item_nodes = morph_node.getElementsByTagName("item")
            morph = lingex_structs.LingGramMorph()
            seen_text = False
            for node in item_nodes:
                if node.attributes is None:
                    continue
                kind = node.getAttribute("type")
                if kind == "gls":
                    morph.gloss = xmlutil.getElemText(node)
                elif kind == "msa":
                    morph.pos = xmlutil.getElemText(node)
                elif kind == "txt":
                    content = xmlutil.getElemText(node)
                    if seen_text:
                        morph.text2 = content
                    else:
                        morph.text1 = content
                        seen_text = True

            if not self.config.separateMorphColumns:
                merged.add(morph)
            else:
                # One column per morpheme: store it straight away.
                self.ex.appendMorphObj(morph)

        if self.config.separateMorphColumns:
            return
        self.ex.appendMorphObj(
            merged.getMorph(self.config.get_showMorphemeBreaks()))