def handleWord(self, word):
    """Add one word element to the example held in ``self.ex``.

    The direct children of *word* are scanned for typed items. The first
    "txt" item supplies the first word text; any later "txt" item
    overwrites the second word text. Scanning stops at the first "punct"
    item.

    If non-empty punctuation text was found, the word is recorded as
    punctuation only: it is passed to ``self.ex.addPunctuation`` when
    ``self.ex.wordList`` is non-empty, otherwise it is appended as a word
    whose two texts are both the punctuation. Nothing else is done for
    such a word.

    Otherwise the "morph" elements under *word* are handed to
    ``self.handleWordMorphemes``; when there are none, a single morpheme
    object built by ``singleMorphemeWord`` is appended instead. Finally the
    word itself is appended with the two texts collected above.

    :param word: DOM element for the word
    """
    wordText1 = ""
    wordText2 = ""
    punct = None
    is_first_text = True
    for childNode in word.childNodes:
        # Children whose attribute map is missing or empty cannot carry a
        # "type", so they are skipped.
        if not childNode.attributes:
            continue
        itemType = childNode.getAttribute("type")
        if itemType == "txt":
            text = xmlutil.getElemText(childNode)
            if is_first_text:
                wordText1 = text
                is_first_text = False
            else:
                wordText2 = text
        elif itemType == "punct":
            punct = xmlutil.getElemText(childNode)
            break
    if punct:
        if self.ex.wordList:
            self.ex.addPunctuation(punct)
        else:
            # No word yet to attach to, so the punctuation stands alone.
            self.ex.appendWord(punct, punct)
        return
    morphemes = word.getElementsByTagName("morph")
    if morphemes:
        self.handleWordMorphemes(morphemes)
    else:
        self.ex.appendMorphObj(singleMorphemeWord(word))
    self.ex.appendWord(wordText1, wordText2)
def singleMorphemeWord(word):
    """Build one morpheme object from the items found under a word.

    Intended for words that consist of a single morpheme, where the gloss
    and part of speech are taken from word-level items rather than from
    morpheme-level ones.

    :param word: DOM element for the word
    :returns: a ``lingex_structs.LingGramMorph`` filled from the "gls" and
        "msa" items
    """
    result = lingex_structs.LingGramMorph()
    for element in word.getElementsByTagName("item"):
        if element.attributes is None:
            continue
        kind = element.getAttribute("type")
        if kind == "msa":
            result.pos = xmlutil.getElemText(element)
        elif kind == "gls":
            # When a gloss was already stored and text1 is still unset, the
            # earlier gloss is kept as text1 before being replaced.
            if result.gloss and not result.text1:
                result.text1 = result.gloss
            result.gloss = xmlutil.getElemText(element)
    return result
def handleSentence(self, sentence):
    """Read one sentence element into the example held in ``self.ex``.

    When ``self.use_segnum`` is set, the stripped text of the first direct
    child typed "segnum" becomes ``self.ex.refText``. The text of each
    direct child typed "gls" is stored as ``self.ex.freeTrans``, so the
    last one wins. Every "word" element found under the sentence is then
    passed to ``self.handleWord``. Finally, a truthy ``self.prefix`` is
    prepended to ``self.ex.refText``.

    :param sentence: DOM element for the sentence
    """
    logger.debug(util.funcName('begin'))
    if self.use_segnum:
        for node in sentence.childNodes:
            if node.attributes and node.getAttribute("type") == "segnum":
                self.ex.refText = xmlutil.getElemText(node).strip()
                break
    wordElems = sentence.getElementsByTagName("word")
    for node in sentence.childNodes:
        if node.attributes is not None and node.getAttribute("type") == "gls":
            self.ex.freeTrans = xmlutil.getElemText(node)
    for wordElem in wordElems:
        self.handleWord(wordElem)
    if not self.prefix:
        return
    self.ex.refText = self.prefix + self.ex.refText
def handleWordMorphemes(self, morphemes):
    """Turn a word's morpheme elements into morpheme objects on ``self.ex``.

    For each morpheme element a ``lingex_structs.LingGramMorph`` is filled
    from its items: the first "txt" item sets ``text1`` and any later one
    overwrites ``text2``; "gls" sets ``gloss``; "msa" sets ``pos``.

    When ``self.config.separateMorphColumns`` is truthy, each morpheme
    object is appended on its own. Otherwise they are collected in a
    ``MergedMorphemes`` and a single combined object is appended after the
    loop.

    :param morphemes: iterable of DOM elements, one per morpheme
    """
    merged = MergedMorphemes()
    for morphElem in morphemes:
        itemElems = morphElem.getElementsByTagName("item")
        current = lingex_structs.LingGramMorph()
        seen_text = False
        for itemElem in itemElems:
            if itemElem.attributes is None:
                continue
            kind = itemElem.getAttribute("type")
            if kind == "txt":
                content = xmlutil.getElemText(itemElem)
                if seen_text:
                    current.text2 = content
                else:
                    current.text1 = content
                    seen_text = True
            elif kind == "gls":
                current.gloss = xmlutil.getElemText(itemElem)
            elif kind == "msa":
                current.pos = xmlutil.getElemText(itemElem)
            # "cf" items (lex entry, typically same as morph text) and any
            # other item types are deliberately ignored.
        if not self.config.separateMorphColumns:
            merged.add(current)
        else:
            # store each morpheme separately
            self.ex.appendMorphObj(current)
    if self.config.separateMorphColumns:
        return
    self.ex.appendMorphObj(
        merged.getMorph(self.config.get_showMorphemeBreaks()))