コード例 #1
0
ファイル: childes.py プロジェクト: Joselin/nltk
    def _get_participants(self, fileid):
        """Collect the attributes of every participant declared in an XML file.

        :param fileid: passed straight to ``ElementTree.parse``.
        :return: a nested ``defaultdict`` mapping each participant's ``id``
            attribute to a mapping of that participant's attribute names to
            attribute values.
        """
        def nested():
            # a dict whose missing keys are themselves auto-creating dicts
            return defaultdict(nested)

        root = ElementTree.parse(fileid).getroot()
        participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)

        participants = nested()
        for node in root.findall(participant_path):
            for attr_name, attr_value in node.items():
                participants[node.get('id')][attr_name] = attr_value
        return participants
コード例 #2
0
ファイル: childes.py プロジェクト: Joselin/nltk
 def _get_age(self, fileid, month):
     """Return the age recorded for the participant whose id is ``'CHI'``.

     :param fileid: passed straight to ``ElementTree.parse``.
     :param month: if true, the raw ``age`` attribute is run through
         ``self._convert_age`` before being returned.
     :return: the ``age`` attribute of the first ``'CHI'`` participant
         (converted when ``month`` is true); ``None`` when no such
         participant exists, when the attribute is absent, or when the
         conversion raises ``TypeError``/``AttributeError``.
     """
     xmldoc = ElementTree.parse(fileid).getroot()
     for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
         if pat.get('id') != 'CHI':
             continue
         age = pat.get('age')
         if month:
             try:
                 age = self._convert_age(age)
             # some files don't have age data
             except (TypeError, AttributeError):
                 return None
         return age
     return None
コード例 #3
0
 def _get_age(self, fileid, month):
     """Return the age recorded for the participant whose id is ``'CHI'``.

     :param fileid: passed straight to ``ElementTree.parse``.
     :param month: if true, the raw ``age`` attribute is run through
         ``_convert_age`` before being returned.
     :return: the ``age`` attribute of the first ``'CHI'`` participant
         (converted when ``month`` is true); ``None`` when no such
         participant exists, when the attribute is absent, or when the
         conversion raises ``TypeError``/``AttributeError``.
     """
     xmldoc = ElementTree.parse(fileid).getroot()
     for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' %
                               (NS, NS)):
         if pat.get('id') != 'CHI':
             continue
         age = pat.get('age')
         if month:
             try:
                 # NOTE(review): called as a bare name, so it must be
                 # defined at module scope -- confirm against the full file.
                 age = _convert_age(age)
             # some files don't have age data
             except (TypeError, AttributeError):
                 return None
         return age
     return None
コード例 #4
0
    def _get_participants(self, fileid):
        """Build a two-level table of participant attributes from an XML file.

        :param fileid: passed straight to ``ElementTree.parse``.
        :return: a recursively auto-creating ``defaultdict`` keyed first by
            each participant's ``id`` attribute and then by attribute name,
            holding the attribute values.
        """
        def make_table():
            # multidimensional dict: unknown keys create a further level
            return defaultdict(make_table)

        document_root = ElementTree.parse(fileid).getroot()
        matches = document_root.findall(
            './/{%s}Participants/{%s}participant' % (NS, NS))

        table = make_table()
        for element in matches:
            for pair in element.items():
                name, content = pair
                table[element.get('id')][name] = content
        return table
コード例 #5
0
ファイル: childes.py プロジェクト: Joselin/nltk
 def _get_words(self, fileid, speaker, sent, stem, relation, pos,
         strip_space, replace):
     """Extract the words of the selected utterances from one XML file.

     :param fileid: passed straight to ``ElementTree.parse``.
     :param speaker: ``'ALL'`` to keep every utterance; otherwise an
         utterance is kept when its ``who`` attribute is ``in`` this value.
     :param sent: if true, each utterance's words are appended to the
         result as one list instead of being flattened into it.
     :param stem: if true, the word text is replaced by the text of its
         ``stem`` element, followed by ``'-'`` and the inflection text
         when one is found.
     :param relation: when equal to ``True``, stems and tags are collected
         as for ``stem``/``pos`` and an ``"index|head|relation"`` string
         taken from the ``gra`` elements is added to each word tuple;
         utterances are also kept as separate lists.
     :param pos: if true, each word becomes a ``(word, tag)`` tuple, the
         tag being the text of the first ``c`` element under the word (or
         ``None`` when it cannot be read).
     :param strip_space: if true, surrounding whitespace is stripped from
         the word text.
     :param replace: if true, replacement / ``wk`` elements found in the
         utterance are used in place of the current word element.
     """
     # Namespace-qualified search paths, built once instead of per word.
     utterance_path = './/{%s}u' % NS
     word_path = './/{%s}w' % NS
     replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
     replacement_word_path = './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
     wk_path = './/{%s}w/{%s}wk' % (NS, NS)
     stem_path = './/{%s}stem' % NS
     infl_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
     suffix_path = ('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                    % (NS, NS, NS, NS))
     pos_path = ".//{%s}c" % NS
     gra_path = './/{%s}mor/{%s}gra' % (NS, NS)
     post_gra_path = './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)

     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall(utterance_path):
         sents = []
         # select speakers
         if speaker == 'ALL' or xmlsent.get('who') in speaker:
             for xmlword in xmlsent.findall(word_path):
                 suffixStem = None
                 # getting replaced words
                 # NOTE(review): these tests rely on Element truthiness,
                 # which is False for an element without children, and they
                 # search the whole utterance rather than the current word
                 # -- confirm both are intended before changing them.
                 if replace and xmlsent.find(replacement_path):
                     xmlword = xmlsent.find(replacement_word_path)
                 elif replace and xmlsent.find(wk_path):
                     xmlword = xmlsent.find(wk_path)
                 # get text
                 if xmlword.text:
                     word = xmlword.text
                 else:
                     word = ''
                 # strip tailing space
                 if strip_space:
                     word = word.strip()
                 # stem
                 if relation or stem:
                     try:
                         xmlstem = xmlword.find(stem_path)
                         word = xmlstem.text
                     except AttributeError:
                         # no stem element: keep the surface form
                         pass
                     # if there is an inflection
                     try:
                         xmlinfl = xmlword.find(infl_path)
                         word += '-' + xmlinfl.text
                     except (AttributeError, TypeError):
                         # no inflection element, or missing text on
                         # either side of the concatenation
                         pass
                     # if there is a suffix
                     try:
                         xmlsuffix = xmlword.find(suffix_path)
                         suffixStem = xmlsuffix.text
                     except AttributeError:
                         suffixStem = ""
                 # pos
                 if relation or pos:
                     try:
                         xmlpos = xmlword.findall(pos_path)
                         word = (word, xmlpos[0].text)
                         if len(xmlpos) != 1 and suffixStem:
                             suffixStem = (suffixStem, xmlpos[1].text)
                     except (AttributeError, IndexError):
                         word = (word, None)
                         if suffixStem:
                             suffixStem = (suffixStem, None)
                 # relational
                 # the gold standard is stored in <mor></mor><mor type="trn"><gra type="grt">
                 if relation == True:
                     for xmlstem_rel in xmlword.findall(gra_path):
                         if not xmlstem_rel.get('type') == 'grt':
                             word = (word[0], word[1],
                                     xmlstem_rel.get('index') + "|" +
                                     xmlstem_rel.get('head') + "|" +
                                     xmlstem_rel.get('relation'))
                         else:
                             word = (word[0], word[1], word[2],
                                     word[0], word[1],
                                     xmlstem_rel.get('index') + "|" +
                                     xmlstem_rel.get('head') + "|" +
                                     xmlstem_rel.get('relation'))
                     try:
                         for xmlpost_rel in xmlword.findall(post_gra_path):
                             if not xmlpost_rel.get('type') == 'grt':
                                 suffixStem = (suffixStem[0], suffixStem[1],
                                               xmlpost_rel.get('index') + "|" +
                                               xmlpost_rel.get('head') + "|" +
                                               xmlpost_rel.get('relation'))
                             else:
                                 suffixStem = (suffixStem[0], suffixStem[1],
                                               suffixStem[2],
                                               suffixStem[0], suffixStem[1],
                                               xmlpost_rel.get('index') + "|" +
                                               xmlpost_rel.get('head') + "|" +
                                               xmlpost_rel.get('relation'))
                     except (AttributeError, IndexError, TypeError):
                         # best effort: the suffix may be absent (None or
                         # "") or lack the expected fields/attributes
                         pass
                 sents.append(word)
                 if suffixStem:
                     sents.append(suffixStem)
             if sent or relation:
                 results.append(sents)
             else:
                 results.extend(sents)
     # NOTE(review): the visible code ends here without returning
     # ``results`` -- confirm whether a trailing return was lost.
コード例 #6
0
ファイル: childes.py プロジェクト: Joselin/nltk
 def _get_corpus(self, fileid):
     results = dict()
     xmldoc = ElementTree.parse(fileid).getroot()
     for key, value in xmldoc.items():
         results[key] = value
     return results
コード例 #7
0
 def _get_words(self, fileid, speaker, sent, stem, relation, pos,
                strip_space, replace):
     """Extract the words of the selected utterances from one XML file.

     :param fileid: passed straight to ``ElementTree.parse``.
     :param speaker: ``'ALL'`` to keep every utterance; otherwise an
         utterance is kept when its ``who`` attribute equals this value.
     :param sent: if true, each utterance's words are appended to the
         result as one list instead of being flattened into it.
     :param stem: if true, the word text is replaced by the text of its
         ``stem`` element, followed by ``'-'`` and the inflection text
         when one is found.
     :param relation: when equal to ``True``, stems and tags are collected
         as for ``stem``/``pos`` and an ``"index|head|relation"`` string
         taken from the ``gra`` elements is added to each word tuple;
         utterances are also kept as separate lists.
     :param pos: if true, each word becomes a ``(word, tag)`` tuple, the
         tag being the text of the first ``c`` element under the word (or
         ``None`` when it cannot be read).
     :param strip_space: if true, surrounding whitespace is stripped from
         the word text.
     :param replace: if true, replacement / ``wk`` elements found in the
         utterance are used in place of the current word element.
     """
     # Namespace-qualified search paths, built once instead of per word.
     utterance_path = './/{%s}u' % NS
     word_path = './/{%s}w' % NS
     replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
     replacement_word_path = './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
     wk_path = './/{%s}w/{%s}wk' % (NS, NS)
     stem_path = './/{%s}stem' % NS
     infl_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
     suffix_path = ('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                    % (NS, NS, NS, NS))
     pos_path = ".//{%s}c" % NS
     gra_path = './/{%s}mor/{%s}gra' % (NS, NS)
     post_gra_path = './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)

     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall(utterance_path):
         sents = []
         # select speakers
         if speaker == 'ALL' or xmlsent.get('who') == speaker:
             for xmlword in xmlsent.findall(word_path):
                 suffixStem = None
                 # getting replaced words
                 # NOTE(review): these tests rely on Element truthiness,
                 # which is False for an element without children, and they
                 # search the whole utterance rather than the current word
                 # -- confirm both are intended before changing them.
                 if replace and xmlsent.find(replacement_path):
                     xmlword = xmlsent.find(replacement_word_path)
                 elif replace and xmlsent.find(wk_path):
                     xmlword = xmlsent.find(wk_path)
                 # get text
                 if xmlword.text:
                     word = xmlword.text
                 else:
                     word = ''
                 # strip tailing space
                 if strip_space:
                     word = word.strip()
                 # stem
                 if relation or stem:
                     try:
                         xmlstem = xmlword.find(stem_path)
                         word = xmlstem.text
                     except AttributeError:
                         # no stem element: keep the surface form
                         pass
                     # if there is an inflection
                     try:
                         xmlinfl = xmlword.find(infl_path)
                         word += '-' + xmlinfl.text
                     except (AttributeError, TypeError):
                         # no inflection element, or missing text on
                         # either side of the concatenation
                         pass
                     # if there is a suffix
                     try:
                         xmlsuffix = xmlword.find(suffix_path)
                         suffixStem = xmlsuffix.text
                     except AttributeError:
                         suffixStem = ""
                 # pos
                 if relation or pos:
                     try:
                         xmlpos = xmlword.findall(pos_path)
                         word = (word, xmlpos[0].text)
                         if len(xmlpos) != 1 and suffixStem:
                             suffixStem = (suffixStem, xmlpos[1].text)
                     except (AttributeError, IndexError):
                         word = (word, None)
                         if suffixStem:
                             suffixStem = (suffixStem, None)
                 # relational
                 # the gold standard is stored in <mor></mor><mor type="trn"><gra type="grt">
                 if relation == True:
                     for xmlstem_rel in xmlword.findall(gra_path):
                         if not xmlstem_rel.get('type') == 'grt':
                             word = (word[0], word[1],
                                     xmlstem_rel.get('index') + "|" +
                                     xmlstem_rel.get('head') + "|" +
                                     xmlstem_rel.get('relation'))
                         else:
                             word = (word[0], word[1], word[2],
                                     word[0], word[1],
                                     xmlstem_rel.get('index') + "|" +
                                     xmlstem_rel.get('head') + "|" +
                                     xmlstem_rel.get('relation'))
                     try:
                         for xmlpost_rel in xmlword.findall(post_gra_path):
                             if not xmlpost_rel.get('type') == 'grt':
                                 suffixStem = (suffixStem[0], suffixStem[1],
                                               xmlpost_rel.get('index') + "|" +
                                               xmlpost_rel.get('head') + "|" +
                                               xmlpost_rel.get('relation'))
                             else:
                                 suffixStem = (suffixStem[0], suffixStem[1],
                                               suffixStem[2],
                                               suffixStem[0], suffixStem[1],
                                               xmlpost_rel.get('index') + "|" +
                                               xmlpost_rel.get('head') + "|" +
                                               xmlpost_rel.get('relation'))
                     except (AttributeError, IndexError, TypeError):
                         # best effort: the suffix may be absent (None or
                         # "") or lack the expected fields/attributes
                         pass
                 sents.append(word)
                 if suffixStem:
                     sents.append(suffixStem)
             if sent or relation:
                 results.append(sents)
             else:
                 results.extend(sents)
     # NOTE(review): the visible code ends here without returning
     # ``results`` -- confirm whether a trailing return was lost.
コード例 #8
0
 def _get_corpus(self, fileid):
     results = dict()
     xmldoc = ElementTree.parse(fileid).getroot()
     for key, value in xmldoc.items():
         results[key] = value
     return results