def write(self, file, attributes, root_tag):
    """Serialize ``self.data`` as XML and write it to *file*.

    Refuses to overwrite the original source file (``self._fileid``); in
    that case a warning is printed and nothing is written.
    """
    if file == self._fileid:
        print("Warning: you were about to write over original file")
        return
    root_element = _dict_to_xml(self.data, ElementTree.Element(root_tag), attributes)
    ElementTree.ElementTree(root_element).write(file, encoding='utf-8')
def _dict_to_xml(d, element, attributes): """ Build XML Element of all data in D. Attribute is list of keys in D that should be attributes. :type d: dict :param d: Dictionary to turn into XML :type element: str :param element: Tag of root element :type attributes: list(str) :param attributes: Keys that should be attributes, not children :rtpye: ElementTree :return: All information in D as XML """ for key in d: if key == 'rtext': element.text = d[key] elif key in attributes: element.set(key, d[key]) else: for child_dict in d[key]: subelement = ElementTree.SubElement(element, key) _dict_to_xml(child_dict, subelement, attributes) return element
def _get_sex(self, fileid, speaker):
    """Return the ``sex`` attribute recorded for *speaker* in *fileid*.

    Returns None when the participant has no usable data.
    """
    xmldoc = ElementTree.parse(fileid).getroot()
    for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
        try:
            if pat.get('id') == speaker:
                sex = pat.get('sex')
                return sex
        # some files don't have this data
        # BUGFIX: ``except (..., ...), e`` is Python 2 syntax (SyntaxError on
        # Python 3); the bound exception was unused, so it is dropped.
        except (TypeError, AttributeError):
            return None
def _get_sex(self, fileid, speaker):
    """Return the ``sex`` attribute recorded for *speaker* in *fileid*.

    Returns None when the participant has no usable data.
    """
    xmldoc = ElementTree.parse(fileid).getroot()
    for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
        try:
            if pat.get('id') == speaker:
                sex = pat.get('sex')
                return sex
        # some files don't have this data
        # BUGFIX: ported Python 2 ``except (..., ...), e`` syntax, which is a
        # SyntaxError under Python 3; the bound exception was unused.
        except (TypeError, AttributeError):
            return None
def _get_age(self, fileid, speaker, month):
    """Look up *speaker*'s age in *fileid*; convert to months when *month* is true.

    Returns None when the participant has no usable age data.
    """
    root = ElementTree.parse(fileid).getroot()
    for participant in root.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
        try:
            if participant.get("id") != speaker:
                continue
            age = participant.get("age")
            return self.convert_age(age) if month else age
        # some files don't have age data
        except (TypeError, AttributeError):
            return None
def _get_age(self, fileid, speaker, month):
    """Return the recorded age of *speaker* (in months when *month*), or None."""
    xmlroot = ElementTree.parse(fileid).getroot()
    query = './/{%s}Participants/{%s}participant' % (NS, NS)
    for record in xmlroot.findall(query):
        try:
            if record.get('id') == speaker:
                raw_age = record.get('age')
                if month:
                    raw_age = self.convert_age(raw_age)
                return raw_age
        except (TypeError, AttributeError):
            # some files don't have age data
            return None
def _get_age(self, fileid, speaker, month):
    """Return *speaker*'s age from *fileid*, converted to months if *month*.

    Returns None when the file carries no usable age data.
    """
    xmldoc = ElementTree.parse(fileid).getroot()
    for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
        try:
            if pat.get('id') == speaker:
                age = pat.get('age')
                if month:
                    age = self.convert_age(age)
                return age
        # some files don't have age data
        # BUGFIX: was a bare ``except:`` which would also swallow
        # KeyboardInterrupt/SystemExit; narrowed to the failures that
        # missing data actually produces (matching the sibling variants).
        # Also dropped the redundant parentheses around the return values.
        except (TypeError, AttributeError):
            return None
def _get_participants(self, fileid):
    """Collect every participant's XML attributes, keyed by participant id."""
    # autovivifying nested dict: participants[id][attribute] -> value
    def _nested_dict():
        return defaultdict(_nested_dict)
    root = ElementTree.parse(fileid).getroot()
    participants = _nested_dict()
    query = './/{%s}Participants/{%s}participant' % (NS, NS)
    for node in root.findall(query):
        pid = node.get('id')
        for attr, value in node.items():
            participants[pid][attr] = value
    return participants
def _get_participants(self, fileid):
    """Return participants' metadata as a two-level mapping id -> attribute -> value."""
    make_table = lambda: defaultdict(make_table)  # autovivifying nested dict
    doc = ElementTree.parse(fileid).getroot()
    table = make_table()
    for person in doc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
        pid = person.get('id')
        for field, val in person.items():
            table[pid][field] = val
    return table
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
    """
    Helper used to implement the view methods -- returns a list of
    words or a list of sentences, optionally tagged.

    :param fileid: The name of the underlying file.
    :param bracket_sent: If true, include sentence bracketing.
    :param tag: The name of the tagset to use, or None for no tags.
    :param strip_space: If true, strip spaces from word tokens.
    :param stem: If true, then substitute stems for words.
    """
    result = []
    xmldoc = ElementTree.parse(fileid).getroot()
    for xmlsent in xmldoc.findall('.//s'):
        sent = []
        for xmlword in _all_xmlwords_in(xmlsent):
            word = xmlword.text
            if not word:
                word = ""  # fixes issue 337?
            if strip_space or stem:
                word = word.strip()
            if stem:
                word = xmlword.get('hw', word)
            # BUGFIX: the tag checks were a broken ``if``/``if``/``elif``
            # chain ('c5' tested separately from 'ctag'/'pos'); a single
            # chain makes the mutually exclusive cases explicit.
            if tag == 'c5':
                word = (word, xmlword.get('c5'))
            elif tag == 'ctag':
                word = (word, xmlword.get('ctag'))
            elif tag == 'pos':
                word = (word, xmlword.get('pos', xmlword.get('c5')))
            sent.append(word)
        if bracket_sent:
            result.append(NNCSentence(xmlsent.attrib['n'], sent))
        else:
            result.extend(sent)
    assert None not in result
    return result
def _words(self, fileid, bracket_sent, tag, strip_space, stem):
    """
    Shared implementation behind the word/sentence view methods.

    :param fileid: The name of the underlying file.
    :param bracket_sent: If true, include sentence bracketing.
    :param tag: The name of the tagset to use, or None for no tags.
    :param strip_space: If true, strip spaces from word tokens.
    :param stem: If true, then substitute stems for words.
    """
    collected = []
    root = ElementTree.parse(fileid).getroot()
    for sent_elem in root.findall('.//s'):
        tokens = []
        for word_elem in _all_xmlwords_in(sent_elem):
            text = word_elem.text or ""  # fixes issue 337?
            if strip_space or stem:
                text = text.strip()
            if stem:
                text = word_elem.get('hw', text)
            if tag == 'c5':
                text = (text, word_elem.get('c5'))
            elif tag == 'pos':
                text = (text, word_elem.get('pos', word_elem.get('c5')))
            tokens.append(text)
        if bracket_sent:
            collected.append(BNCSentence(sent_elem.attrib['n'], tokens))
        else:
            collected.extend(tokens)
    assert None not in collected
    return collected
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one CHILDES XML file.

    Returns a flat list of words, or per-utterance lists when *sent* or
    *relation* is set; in this variant each utterance list starts with the
    speaker code (``xmlsent.get('who')``).  Words become ``(word, tag)``
    tuples when *pos*/*relation* is set, and grow dependency-relation
    fields when *relation* is true.
    """
    if isinstance(speaker, string_types) and speaker != 'ALL':  # ensure we have a list of speakers
        speaker = [ speaker ]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        # first element of the utterance list is the speaker code
        sents = [xmlsent.get('who')]
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                infl = None ; suffixStem = None
                # getting replaced words
                # NOTE(review): looks up replacements at sentence level, so
                # every word in the utterance maps to the same first
                # replacement -- confirm against the per-word variants below.
                if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS,NS)):
                    xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w' % (NS,NS,NS))
                elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)):
                    xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        # no <stem> child: .find() returned None
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' % (NS,NS,NS))
                        word += '-' + xmlinfl.text
                    # NOTE(review): bare except -- presumably guards against
                    # a missing <mk> element (AttributeError/TypeError).
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS,NS,NS,NS))
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            # compound tag: category + subcategory
                            tag = xmlpos[0].text+":"+xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                        word = (word,tag)
                    except (AttributeError,IndexError) as e:
                        word = (word,None)
                        if suffixStem:
                            suffixStem = (suffixStem,None)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra' % (NS,NS)):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head') + "|" +
                                    xmlstem_rel.get('relation'))
                        else:
                            # gold-standard relation: keep the original triple
                            # and append the gold fields
                            word = (word[0], word[1], word[2], word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head') + "|" +
                                    xmlstem_rel.get('relation'))
                    try:
                        for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra' % (NS,NS,NS)):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head') + "|" +
                                              xmlpost_rel.get('relation'))
                            else:
                                suffixStem = (suffixStem[0], suffixStem[1], suffixStem[2],
                                              suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head') + "|" +
                                              xmlpost_rel.get('relation'))
                    # NOTE(review): bare except -- suffixStem may still be a
                    # plain string (or None) here, so indexing can raise.
                    except:
                        pass
                sents.append(word)
                if suffixStem:
                    sents.append(suffixStem)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return results
def get_custom_sents(self, fileid):  # speaker, sent, stem, relation, pos, strip_space, replace):
    """Parse one CHILDES XML file into per-utterance tuples.

    Each result tuple is ``(sentID, speaker, terminator, annotations
    [, media], tokens, actual_pho, model_pho)``; the media dict is only
    appended when the utterance carries a ``<media>`` element, so tuple
    length varies (see the TODO below about the confusing structure).
    Tokens are dicts with gloss/morphology/relation/phonology fields.
    """
    fileid = self.abspaths([fileid])[0]
    tree = ElementTree.parse(fileid)
    xmldoc = tree.getroot()
    # check if this file has phonological transcriptions
    if xmldoc.find('.//{%s}pw' % NS) is not None:
        fileHasPhonology = True
        print('File has phonological transcripts. Processing...')
    else:
        fileHasPhonology = False
        print('File has no phonological transcripts. Skipping extraction of phonological information.')
    results2 = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        # TODO confusing tuple structure, use map
        utt = ()
        sentID = xmlsent.get('uID')
        sents = []
        # place this in map
        speaker = xmlsent.get('who')  # ME
        utt += (sentID, speaker)
        tokens = []
        token_order = 0
        skip_replacement_counter = 0
        # extract utterance terminator
        terminator = xmlsent.find(".//{%s}t" % NS).attrib['type']
        utt += (terminator, )
        # get dependent tiers / annotations
        # TODO get a bunch of stuff and return in convenient format
        annotations = []
        annotation_elements = xmlsent.findall(".//{%s}a" % NS)
        for element in annotation_elements:
            annotation = {}
            annotation['type'] = element.attrib.get('type')
            annotation['flavor'] = element.attrib.get('flavor')
            annotation['who'] = element.attrib.get('who')
            annotation['text'] = element.text
            annotations.append(annotation)
        utt += (annotations, )  # does this capture the phonetic tier?
        # extract media info, if it exists
        media = {}
        media_element = xmlsent.findall(".//{%s}media" % NS)
        if media_element:
            media['start'] = media_element[0].attrib['start']
            media['end'] = media_element[0].attrib['end']
            media['unit'] = media_element[0].attrib['unit']
            # NOTE(review): media is only appended when present, which is
            # what makes the result tuples variable-length.
            utt += (media, )
        # Pull out the phonology tiers
        if fileHasPhonology:
            actual_pho, model_pho = get_phonology(xmlsent, speaker, sentID, fileid)
            # word-level phonology is only trustworthy when it aligns 1:1
            # with the <w> elements of the utterance
            num_tokens = len(xmlsent.findall('.//{%s}w' % NS))
            include_actual_pho = num_tokens == len(actual_pho)
            include_model_pho = num_tokens == len(model_pho)
        else:
            actual_pho = []
            model_pho = []
        for xmlword in xmlsent.findall('.//{%s}w' % NS):
            # skip the replacements of a word - they've already been considered
            if skip_replacement_counter > 0:
                skip_replacement_counter -= 1
                continue
            token = {}
            if xmlword.get('type') == 'omission':
                continue
            suffixStem = None
            #xstr = lambda s: "" if s is None else unicode(s)
            xstr = lambda s: "" if s is None else s
            if xmlword.find('.//{%s}langs' % (NS)):
                xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail
            # handles compounds and ignores shortenings (?)
            text_tags = [
                "{%s}wk" % NS, "{%s}p" % NS, "{%s}shortening" % NS
            ]
            if xmlword.findall('*'):
                word_tags = xmlword.findall('*')
                text = xstr(xmlword.text)
                for word_tag in word_tags:
                    if word_tag.tag in text_tags:
                        if word_tag.tag == "{%s}wk" % NS:
                            # compound marker is rendered as '+'
                            text += "+"
                        text += xstr(word_tag.text) + xstr(word_tag.tail)
                xmlword.text = text
            if xmlword.text:
                word = xmlword.text
                token['gloss'] = xmlword.text.strip()
            else:
                print('empty word in sentence ' + str(sentID))
                word = ''
                token['gloss'] = ''
            # check if this is a replacement, and then build rep, stem, etc from children
            if xmlword.find('.//{%s}replacement' % (NS)):
                # save children in replacement field
                # iterate over children
                replacements = []
                prefix = []
                pos = []
                stems = []
                suffix = []
                english = []
                clitics = []
                relations = []
                morpheme_length = None
                children = xmlword.findall('.//{%s}w' % NS)
                for child in children:
                    if child.text:
                        replacements.append(child.text)
                    prefix_result, pos_result, stem_result, suffix_result, english_result, clitic_result, morpheme_length_result = \
                        self._get_morphology(child)
                    if prefix_result:
                        prefix.append(prefix_result)
                    # pos_result = self._get_pos(child, None)
                    if pos_result:
                        pos.append(pos_result)
                    # stem_result = self._get_stem(child)
                    if stem_result:
                        stems.append(stem_result)
                    if suffix_result:
                        suffix.append(suffix_result)
                    if english_result:
                        english.append(english_result)
                    if clitic_result:
                        clitics.append(clitic_result)
                    relation_result = self._get_relation(child)
                    if relation_result:
                        relations.append(relation_result)
                    if morpheme_length_result:
                        # accumulate morpheme counts across all children
                        if morpheme_length:
                            morpheme_length += morpheme_length_result
                        else:
                            morpheme_length = morpheme_length_result
                token['replacement'] = ' '.join(replacements)
                token['prefix'] = ' '.join(prefix)
                token['pos'] = ' '.join(pos)
                token['stem'] = ' '.join(stems)
                token['suffix'] = ' '.join(suffix)
                token['english'] = ' '.join(english)
                token['clitic'] = ' '.join(clitics)
                token['relation'] = ' '.join(relations)
                token['morpheme_length'] = morpheme_length
                # the replacement children appear again as <w> siblings;
                # skip that many upcoming words
                skip_replacement_counter = len(children)
            else:
                # else get stem and pos for this word
                # word = word.strip()
                token['prefix'], token['pos'], token['stem'], token['suffix'], token['english'], token['clitic'], token['morpheme_length'] = \
                    self._get_morphology(xmlword)
                # token['stem'] = self._get_stem(xmlword)
                # if suffix, should be in same column
                # token['pos'] = self._get_pos(xmlword, suffixStem)
                token['relation'] = self._get_relation(xmlword)
            # replacement_elems = filter(lambda x: x.tag == '{%s}w' % NS, [e for e in xmlword.iter() if e is not xmlword])
            # replacements = [r.text for r in replacement_elems]
            # replacement_str = ' '.join(replacements)
            # if replacement_str:
            #     token['replacement'] = replacement_str
            #     skip_replacement_counter = len(replacements)
            # parent_map = dict((c, p) for p in tree.getiterator() for c in p)
            #
            # if parent_map.get(xmlword) and parent_map.get(xmlword).tag == '{%s}replacement' % NS:
            #     last_token = tokens[len(tokens) - 1]
            #     last_token['replacement'] = token['gloss']
            #     continue  # don't save this token in tokens array
            # strip tailing space
            token_order += 1
            token['order'] = token_order
            # only include the phonetic information at the word level if it aligns with the set of words
            if fileHasPhonology:
                if include_actual_pho:
                    token['pho'] = actual_pho[(token_order - 1)]
                else:
                    # mismatch in actual_pho and utterance length; not including actual pho at the word level
                    token['pho'] = ''
                if include_model_pho:
                    token['mod'] = model_pho[(token_order - 1)]
                else:
                    # mismatch in model_pho and utterance length; not including model pho at the word level
                    token['mod'] = ''
            else:
                # whole file does not have phonology
                token['pho'] = ''
                token['mod'] = ''
            tokens.append(token)
        # if suffixStem:
        #     sents.append(suffixStem)
        results2.append(utt + (tokens, ) + (actual_pho, ) + (model_pho, ))
    return results2
def _get_corpus(self, fileid): results = dict() xmldoc = ElementTree.parse(fileid).getroot() for key, value in xmldoc.items(): results[key] = value return results
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one CHILDES XML file.

    Returns a flat list of words, or per-utterance lists when *sent* or
    *relation* is set.  Fixes applied: Python 2 ``print`` statement,
    ``unicode`` builtin and ``except E, e`` syntax ported to Python 3, and
    the missing ``return results`` added (the function previously fell off
    the end and returned None).
    """
    if isinstance(speaker, str) and speaker != 'ALL':  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sentID = xmlsent.get('uID')
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                if xmlword.get('type') == 'omission':
                    continue
                infl = None
                suffixStem = None
                # getting replaced words
                xstr = lambda s: "" if s is None else str(s)  # was unicode() (Py2)
                if replace and xmlword.find('.//{%s}replacement' % (NS)):
                    continue
                if xmlword.find('.//{%s}langs' % (NS)):
                    xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail
                # handles compounds and shortenings
                text_tags = ["{%s}wk" % NS, "{%s}p" % NS, "{%s}shortening" % NS]
                if xmlword.findall('*'):
                    word_tags = xmlword.findall('*')
                    text = xstr(xmlword.text)
                    for word_tag in word_tags:
                        if word_tag.tag in text_tags:
                            if word_tag.tag == "{%s}wk" % NS:
                                text += "+"
                            text += xstr(word_tag.text) + xstr(word_tag.tail)
                    xmlword.text = text
                if xmlword.text:
                    word = xmlword.text
                else:
                    print('empty word in sentence %s' % sentID)  # was a Py2 print statement
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError:
                        # no <stem> child for this word
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS))
                        word += '-' + xmlinfl.text
                    except (AttributeError, TypeError):
                        # was a bare except; .find() returning None raises
                        # AttributeError, a None .text raises TypeError
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS, NS, NS, NS))
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        word = (word, xmlpos[0].text)
                        if len(xmlpos) != 1 and suffixStem:
                            suffixStem = (suffixStem, xmlpos[1].text)
                    except (AttributeError, IndexError):
                        word = (word, None)
                        if suffixStem:
                            suffixStem = (suffixStem, None)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra' % (NS, NS)):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head')
                                    + "|" + xmlstem_rel.get('relation'))
                        else:
                            word = (word[0], word[1], word[2], word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head')
                                    + "|" + xmlstem_rel.get('relation'))
                    try:
                        for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head')
                                              + "|" + xmlpost_rel.get('relation'))
                            else:
                                suffixStem = (suffixStem[0], suffixStem[1], suffixStem[2],
                                              suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head')
                                              + "|" + xmlpost_rel.get('relation'))
                    except (TypeError, IndexError):
                        # suffixStem may still be a bare string (or None); keep best-effort
                        pass
                sents.append(word)
                if suffixStem:
                    sents.append(suffixStem)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    # BUGFIX: was missing entirely -- callers received None
    return results
def get_custom_sents(self, fileid):  # speaker, sent, stem, relation, pos, strip_space, replace):
    """Parse one CHILDES XML file into per-utterance tuples.

    Each result tuple is ``(sentID, speaker, tokens)`` where tokens are
    dicts with gloss/stem/pos/relation/order fields.  Fixes applied:
    Python 2 ``print`` statement and ``unicode`` builtin ported to Python 3.
    """
    fileid = self.abspaths([fileid])[0]
    tree = ElementTree.parse(fileid)
    xmldoc = tree.getroot()
    # processing each xml doc
    # results = []
    results2 = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        utt = ()
        sentID = xmlsent.get('uID')
        sents = []
        # place this in map
        speaker = xmlsent.get('who')  # ME
        utt += (sentID, speaker)
        tokens = []
        token_order = 0
        skip_replacement_counter = 0
        for xmlword in xmlsent.findall('.//{%s}w' % NS):
            # skip the replacements of a word - they've already been considered
            if skip_replacement_counter > 0:
                skip_replacement_counter -= 1
                continue
            token = {}
            if xmlword.get('type') == 'omission':
                continue
            suffixStem = None
            xstr = lambda s: "" if s is None else str(s)  # was unicode() (Py2)
            if xmlword.find('.//{%s}langs' % (NS)):
                xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail
            # handles compounds and ignores shortenings (?)
            text_tags = ["{%s}wk" % NS, "{%s}p" % NS, "{%s}shortening" % NS]
            if xmlword.findall('*'):
                word_tags = xmlword.findall('*')
                text = xstr(xmlword.text)
                for word_tag in word_tags:
                    if word_tag.tag in text_tags:
                        if word_tag.tag == "{%s}wk" % NS:
                            text += "+"
                        text += xstr(word_tag.text) + xstr(word_tag.tail)
                xmlword.text = text
            if xmlword.text:
                word = xmlword.text
                token['gloss'] = xmlword.text.strip()
            else:
                print('empty word in sentence %s' % sentID)  # was a Py2 print statement
                word = ''
                token['gloss'] = ''
            # check if this is a replacement, and then build rep, stem, etc from children
            if xmlword.find('.//{%s}replacement' % (NS)):
                # save children in replacement field
                # iterate over children
                replacements = []
                stems = []
                pos = []
                relations = []
                children = xmlword.findall('.//{%s}w' % NS)
                for child in children:
                    # NOTE(review): child.text / _get_stem / _get_pos /
                    # _get_relation may return None, which would make the
                    # ' '.join calls below raise -- confirm helper contracts.
                    replacements.append(child.text)
                    stems.append(self._get_stem(child))
                    pos.append(self._get_pos(child, None))
                    relations.append(self._get_relation(child))
                token['replacement'] = ' '.join(replacements)
                token['stem'] = ' '.join(stems)
                token['pos'] = ' '.join(pos)
                token['relation'] = ' '.join(relations)
                # the replacement children appear again as <w> siblings;
                # skip that many upcoming words
                skip_replacement_counter = len(children)
            else:
                # else get stem and pos for this word
                # word = word.strip()
                token['stem'] = self._get_stem(xmlword)  # if suffix, should be in same column
                token['pos'] = self._get_pos(xmlword, suffixStem)
                token['relation'] = self._get_relation(xmlword)
            # replacement_elems = filter(lambda x: x.tag == '{%s}w' % NS, [e for e in xmlword.iter() if e is not xmlword])
            # replacements = [r.text for r in replacement_elems]
            # replacement_str = ' '.join(replacements)
            # if replacement_str:
            #     token['replacement'] = replacement_str
            #     skip_replacement_counter = len(replacements)
            # parent_map = dict((c, p) for p in tree.getiterator() for c in p)
            #
            # if parent_map.get(xmlword) and parent_map.get(xmlword).tag == '{%s}replacement' % NS:
            #     last_token = tokens[len(tokens) - 1]
            #     last_token['replacement'] = token['gloss']
            #     continue  # don't save this token in tokens array
            # strip tailing space
            token_order += 1
            token['order'] = token_order
            # sents.append(word)
            tokens.append(token)
        # if suffixStem:
        #     sents.append(suffixStem)
        results2.append(utt + (tokens, ))
    return results2
# NOTE(review): fragment -- the ``def`` line of the function this loop
# belongs to lies outside this chunk; preserved as-is.
    for overlap in gwrap.findall('.//{%s}overlap' % NS):
        if overlap.get('type') == type:
            return True
    return False


def getAgeFromFileName(file):
    """Convert a 'YY-MM[-...]' file name into an age in whole months."""
    chunks = file.split('-')
    year = chunks[0]
    month = chunks[1]
    months = (int(year) * 12) + int(month)
    return months


# Script: pair each utterance with the one that follows it and append both
# to a CSV of utterance/response data.
for file in thomas.fileids():
    xmldoc = ElementTree.parse(file).getroot()
    results = []
    with open('thomas_utterance_data.csv', 'a') as csvfile:
        fieldnames = [
            'utterance', 'response', 'speaker', 'responder', 'error', 'age',
            'past_tense', 'plural', 'source_file', 'utterance_start',
            'utterance_end', 'response_start', 'response_end'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        i = 0
        sents = xmldoc.findall('.//{%s}u' % NS)
        while i + 1 < len(sents):
            data = {}
            data['utterance'] = getUtterance(sents[i])
            data['response'] = getUtterance(sents[i + 1])
            data['speaker'] = sents[i].get('who')
            # NOTE(review): chunk is truncated here -- the remainder of the
            # loop body is not visible in this view.
import nltk
from nltk.corpus.reader.xmldocs import ElementTree

# CHILDES TalkBank XML namespace
NS = "http://www.talkbank.org/ns/talkbank"

xstr = lambda s: "" if s is None else str(s)

corpus_root = nltk.data.find("corpora/childes/data-xml/English/Eng-NA-MOR/")

# BUGFIX: ``reload`` is not a builtin in Python 3, and ``childes`` was never
# imported before being reloaded -- import it and use importlib.reload.
import importlib
import childes
importlib.reload(childes)
from childes import CHILDESCorpusReader

corpus_reader = CHILDESCorpusReader(corpus_root, r"Providence/William/wil11.xml")
words = corpus_reader.words(replace=True)
freqs = nltk.FreqDist(words)

fileid = corpus_reader.fileids()[0]
# NOTE(review): assumes ``corpus_root`` supports ``+`` with str (nltk.data.find
# returns a path pointer, not a plain string) -- confirm on the target NLTK
# version; ``os.path.join(str(corpus_root), fileid)`` may be needed.
xmldoc = ElementTree.parse(corpus_root + "/" + fileid).getroot()
xmlsents = xmldoc.findall(".//{%s}u" % NS)
xmlsent = xmlsents[173]
xmlwords = xmlsent.findall(".//{%s}w" % NS)
xmlword = xmlwords[2]
shortenings = xmlword.findall(".//{%s}shortening" % (NS))
# reassemble the surface form: base text plus each shortening's text and tail
text = xstr(xmlword.text) + "".join(
    [xstr(short.text) + xstr(short.tail) for short in shortenings])
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one CHILDES XML file.

    Returns a flat list of words, or per-utterance lists when *sent* or
    *relation* is set.  Words become ``(word, tag)`` tuples when *pos* or
    *relation* is set, and grow dependency-relation fields when *relation*
    is true.
    """
    if isinstance(speaker, string_types) and speaker != 'ALL':
        # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                infl = None
                suffixStem = None
                # getting replaced words
                # NOTE(review): replacements are looked up at sentence level,
                # so every word maps to the same first replacement -- compare
                # with the per-word variants elsewhere in this file.
                if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                    xmlword = xmlsent.find(
                        './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS))
                elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                    xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        # no <stem> child: .find() returned None
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS))
                        word += '-' + xmlinfl.text
                    # NOTE(review): bare except -- presumably guards a missing
                    # <mk> element (AttributeError/TypeError)
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS, NS, NS, NS))
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            # compound tag: category + subcategory
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                        word = (word, tag)
                    except (AttributeError, IndexError) as e:
                        word = (word, None)
                        if suffixStem:
                            suffixStem = (suffixStem, None)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                            './/{%s}mor/{%s}gra' % (NS, NS)):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (word[0], word[1],
                                    xmlstem_rel.get('index') + "|" +
                                    xmlstem_rel.get('head') + "|" +
                                    xmlstem_rel.get('relation'))
                        else:
                            # gold-standard relation: keep the original triple
                            # and append the gold fields
                            word = (word[0], word[1], word[2], word[0], word[1],
                                    xmlstem_rel.get('index') + "|" +
                                    xmlstem_rel.get('head') + "|" +
                                    xmlstem_rel.get('relation'))
                    try:
                        for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" +
                                              xmlpost_rel.get('head') + "|" +
                                              xmlpost_rel.get('relation'))
                            else:
                                suffixStem = (suffixStem[0], suffixStem[1], suffixStem[2],
                                              suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" +
                                              xmlpost_rel.get('head') + "|" +
                                              xmlpost_rel.get('relation'))
                    # NOTE(review): bare except -- suffixStem may still be a
                    # plain string (or None) here, so indexing can raise
                    except:
                        pass
                sents.append(word)
                if suffixStem:
                    sents.append(suffixStem)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return results
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one XML file.

    Refactored variant that delegates to module-level helpers
    (``get_replaced_word``, ``add_inflection``, ``get_pos_tag``) and XPath
    constants (``sent_node``, ``word_node``, ``stem_node``, ``clitic_node``,
    ``clitic_pos_tag_node``) defined elsewhere in this module.  Returns a
    flat list of words, or per-utterance lists when *sent* is set.
    """
    # ensure we have a list of speakers
    if isinstance(speaker, string_types) and speaker != 'ALL':
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each sentence in xml doc
    results = []
    for xmlsent in xmldoc.findall(sent_node):
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            # process each word
            for xml_word in xmlsent.findall(word_node):
                clitic_stem = None
                # get replaced words
                if replace:
                    xml_word = get_replaced_word(xmlsent, xml_word)
                # get text
                if xml_word.text:
                    word = xml_word.text
                else:
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # get stemmed words
                if stem:
                    try:
                        xmlstem = xml_word.find(stem_node)
                        word = xmlstem.text
                    except AttributeError:
                        # no stem element: .find() returned None
                        pass
                    # if there is an inflection
                    try:
                        word = add_inflection(xml_word, word)
                    # NOTE(review): bare except -- presumably guards a
                    # missing inflection element; confirm what
                    # ``add_inflection`` can raise.
                    except:
                        pass
                    # if there is a clitic
                    try:
                        xmlclitic = xml_word.find(clitic_node)
                        clitic_stem = xmlclitic.text
                    except AttributeError:
                        clitic_stem = ''
                # get pos
                if pos:
                    try:
                        tag = get_pos_tag(xml_word)
                        word = (word, tag)
                    except (AttributeError, IndexError):
                        word = (word, None)
                        if clitic_stem:
                            # add clitic's pos tag if there is one
                            # in the parent class method, this branch does not fetch the clitic -- this is changed here
                            clitic_pos = xml_word.find(clitic_pos_tag_node)
                            if clitic_pos is not None:
                                # NOTE(review): reads the 'relation' attribute
                                # as the clitic's POS tag -- looks suspicious
                                # for a POS field; confirm against the schema.
                                clitic_stem = (clitic_stem, clitic_pos.get('relation'))
                            else:
                                clitic_stem = (clitic_stem, None)
                sents.append(word)
                if clitic_stem:
                    sents.append(clitic_stem)
            if sent:
                results.append(sents)
            else:
                results.extend(sents)
    return results
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one CHILDES XML file.

    Returns a flat list of words, or per-utterance lists when *sent* or
    *relation* is set.  Fixes applied: Python 2 ``print`` statement,
    ``unicode`` builtin and ``except E, e`` syntax ported to Python 3, and
    the missing ``return results`` added (the function previously fell off
    the end and returned None).
    """
    if isinstance(speaker, str) and speaker != 'ALL':  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sentID = xmlsent.get('uID')
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                if xmlword.get('type') == 'omission':
                    continue
                infl = None
                suffixStem = None
                # getting replaced words
                xstr = lambda s: "" if s is None else str(s)  # was unicode() (Py2)
                if replace and xmlword.find('.//{%s}replacement' % (NS)):
                    continue
                if xmlword.find('.//{%s}langs' % (NS)):
                    xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail
                # handles compounds and shortenings
                text_tags = ["{%s}wk" % NS, "{%s}p" % NS, "{%s}shortening" % NS]
                if xmlword.findall('*'):
                    word_tags = xmlword.findall('*')
                    text = xstr(xmlword.text)
                    for word_tag in word_tags:
                        if word_tag.tag in text_tags:
                            if word_tag.tag == "{%s}wk" % NS:
                                text += "+"
                            text += xstr(word_tag.text) + xstr(word_tag.tail)
                    xmlword.text = text
                if xmlword.text:
                    word = xmlword.text
                else:
                    print('empty word in sentence %s' % sentID)  # was a Py2 print statement
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError:
                        # no <stem> child for this word
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS))
                        word += '-' + xmlinfl.text
                    except (AttributeError, TypeError):
                        # was a bare except; .find() returning None raises
                        # AttributeError, a None .text raises TypeError
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS, NS, NS, NS))
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        word = (word, xmlpos[0].text)
                        if len(xmlpos) != 1 and suffixStem:
                            suffixStem = (suffixStem, xmlpos[1].text)
                    except (AttributeError, IndexError):
                        word = (word, None)
                        if suffixStem:
                            suffixStem = (suffixStem, None)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra' % (NS, NS)):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head')
                                    + "|" + xmlstem_rel.get('relation'))
                        else:
                            word = (word[0], word[1], word[2], word[0], word[1],
                                    xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head')
                                    + "|" + xmlstem_rel.get('relation'))
                    try:
                        for xmlpost_rel in xmlword.findall(
                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head')
                                              + "|" + xmlpost_rel.get('relation'))
                            else:
                                suffixStem = (suffixStem[0], suffixStem[1], suffixStem[2],
                                              suffixStem[0], suffixStem[1],
                                              xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head')
                                              + "|" + xmlpost_rel.get('relation'))
                    except (TypeError, IndexError):
                        # suffixStem may still be a bare string (or None); keep best-effort
                        pass
                sents.append(word)
                if suffixStem:
                    sents.append(suffixStem)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    # BUGFIX: was missing entirely -- callers received None
    return results
def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace):
    """Extract words for the selected *speaker*(s) from one CHILDES XML file.

    Variant that merges the clitic/suffix stem into the word itself with a
    ``~`` separator (and likewise merges the suffix POS tag into the tag).
    Returns a LazyMap over a flat word list, or over per-utterance lists
    when *sent* or *relation* is set.
    """
    if (isinstance(speaker, string_types) and speaker != "ALL"):
        # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall(".//{%s}u" % NS):
        sents = []
        # select speakers
        if speaker == "ALL" or xmlsent.get("who") in speaker:
            for xmlword in xmlsent.findall(".//{%s}w" % NS):
                infl = None
                suffixStem = None
                suffixTag = None
                # getting replaced words
                # NOTE(review): replacements looked up at sentence level, so
                # every word maps to the same first replacement -- compare
                # with the per-word variants elsewhere in this file.
                if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
                    xmlword = xmlsent.find(
                        ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS))
                elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
                    xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ""
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find(".//{%s}stem" % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        # no <stem> child: .find() returned None
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS))
                        word += "-" + xmlinfl.text
                    # NOTE(review): bare except -- presumably guards a
                    # missing <mk> element (AttributeError/TypeError)
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem" % (NS, NS, NS, NS))
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        # clitic stem is fused into the word with '~'
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            # compound tag: category + subcategory
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError) as e:
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                            % (NS, NS, NS, NS, NS))
                        xmlsuffixpos2 = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                            % (NS, NS, NS, NS, NS))
                        if xmlsuffixpos2:
                            suffixTag = (xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text)
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    # NOTE(review): bare except -- presumably guards missing
                    # suffix-POS elements (IndexError on empty findall)
                    except:
                        pass
                    if suffixTag:
                        # suffix tag is fused into the tag with '~'
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                            ".//{%s}mor/{%s}gra" % (NS, NS)):
                        if not xmlstem_rel.get("type") == "grt":
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                        else:
                            # gold-standard relation: keep the original
                            # triple and append the gold fields
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                    try:
                        for xmlpost_rel in xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)):
                            if not xmlpost_rel.get("type") == "grt":
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                    # NOTE(review): bare except -- suffixStem is a plain
                    # string (or None) here, so indexing can raise
                    except:
                        pass
                sents.append(word)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return LazyMap(lambda x: x, results)