コード例 #1
0
ファイル: flex.py プロジェクト: geoffbacon/qp
 def write(self, file, attributes, root_tag):
     """Serialize ``self.data`` as XML and save it to ``file``.

     Nothing is written when ``file`` equals ``self._fileid``; a warning is
     printed and the method returns ``None`` instead.

     :param file: Destination handed to ``ElementTree.write``.
     :param attributes: Keys emitted as XML attributes rather than child
         elements (forwarded to ``_dict_to_xml``).
     :param root_tag: Tag name of the root element that is created.
     """
     if file == self._fileid:
         # Refuse to overwrite what the warning calls the original file.
         print("Warning: you were about to write over original file")
         return
     root_element = ElementTree.Element(root_tag)
     populated = _dict_to_xml(self.data, root_element, attributes)
     ElementTree.ElementTree(populated).write(file, encoding='utf-8')
コード例 #2
0
ファイル: flex.py プロジェクト: geoffbacon/qp
def _dict_to_xml(d, element, attributes):
    """
    Build XML Element of all data in D.
    
    Attribute is list of keys in D that should be attributes.
    
    :type d: dict
    :param d: Dictionary to turn into XML
    :type element: str
    :param element: Tag of root element
    :type attributes: list(str)
    :param attributes: Keys that should be attributes, not children
    
    :rtpye: ElementTree
    :return: All information in D as XML
    """
    for key in d:
        if key == 'rtext':
            element.text = d[key]
        elif key in attributes:
            element.set(key, d[key])
        else:
            for child_dict in d[key]:
                subelement = ElementTree.SubElement(element, key)
                _dict_to_xml(child_dict, subelement, attributes)
    return element
コード例 #3
0
 def _get_sex(self, fileid, speaker):
     """Return the ``sex`` attribute recorded for ``speaker`` in ``fileid``.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: Participant ``id`` to look for.
     :return: Value of the participant's ``sex`` attribute, or ``None``
         when the participant is not found or carries no such attribute.
     """
     xmldoc = ElementTree.parse(fileid).getroot()
     for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' %
                               (NS, NS)):
         try:
             if pat.get('id') == speaker:
                 return pat.get('sex')
         # some files don't have sex data
         # ``except X, e`` is Python 2-only syntax; this form parses on
         # both Python 2.6+ and Python 3.
         except (TypeError, AttributeError):
             return None
     return None
コード例 #4
0
ファイル: childes.py プロジェクト: mikabr/gender-input
 def _get_sex(self, fileid, speaker):
     """Look up the ``sex`` attribute of participant ``speaker``.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: Participant ``id`` to look for.
     :return: Value of the participant's ``sex`` attribute, or ``None``
         when the participant is not found or carries no such attribute.
     """
     xmldoc = ElementTree.parse(fileid).getroot()
     for pat in xmldoc.findall('.//{%s}Participants/{%s}participant'
                               % (NS, NS)):
         try:
             if pat.get('id') == speaker:
                 return pat.get('sex')
         # some files don't have sex data
         # The comma form ``except X, e`` is a SyntaxError on Python 3 and
         # the bound exception was never used, so it is dropped.
         except (TypeError, AttributeError):
             return None
     return None
コード例 #5
0
 def _get_age(self, fileid, speaker, month):
     """Look up the ``age`` attribute recorded for ``speaker`` in ``fileid``.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: Participant ``id`` to search for.
     :param month: If true, pass the raw age through ``self.convert_age``.
     :return: The (possibly converted) age, or ``None`` when no participant
         matches or a ``TypeError``/``AttributeError`` is raised.
     """
     root = ElementTree.parse(fileid).getroot()
     participant_path = ".//{%s}Participants/{%s}participant" % (NS, NS)
     for participant in root.findall(participant_path):
         try:
             if participant.get("id") != speaker:
                 continue
             raw_age = participant.get("age")
             return self.convert_age(raw_age) if month else raw_age
         except (TypeError, AttributeError):
             # some files don't have age data
             return None
コード例 #6
0
ファイル: childes.py プロジェクト: rmalouf/nltk
 def _get_age(self, fileid, speaker, month):
     """Return the age of participant ``speaker`` as stored in ``fileid``.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: Participant ``id`` to search for.
     :param month: If true, the raw ``age`` attribute is converted with
         ``self.convert_age`` before being returned.
     :return: The age value, or ``None`` if the speaker is absent or a
         ``TypeError``/``AttributeError`` occurs during the lookup.
     """
     root = ElementTree.parse(fileid).getroot()
     for node in root.findall(
             './/{%s}Participants/{%s}participant' % (NS, NS)):
         try:
             is_match = node.get('id') == speaker
             if is_match:
                 age = node.get('age')
                 if not month:
                     return age
                 return self.convert_age(age)
         except (TypeError, AttributeError):
             # some files don't have age data
             return None
コード例 #7
0
 def _get_age(self, fileid, speaker, month):
     """Return the ``age`` attribute recorded for ``speaker`` in ``fileid``.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: Participant ``id`` to search for.
     :param month: If true, pass the raw age through ``self.convert_age``.
     :return: The (possibly converted) age, or ``None`` when no participant
         matches or the lookup/conversion fails.
     """
     xmldoc = ElementTree.parse(fileid).getroot()
     for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' %
                               (NS, NS)):
         try:
             if pat.get('id') == speaker:
                 age = pat.get('age')
                 if month:
                     age = self.convert_age(age)
                 return age
         # some files don't have age data
         # Still best-effort, but ``except Exception`` (unlike a bare
         # ``except:``) lets KeyboardInterrupt/SystemExit propagate.
         except Exception:
             return None
     return None
コード例 #8
0
    def _get_participants(self, fileid):
        """Collect the XML attributes of every participant in ``fileid``.

        :param fileid: Path or file object accepted by ``ElementTree.parse``.
        :return: Nested ``defaultdict`` mapping each participant's ``id``
            attribute to a mapping of that participant's attributes.
        """
        def _nested():
            # Auto-vivifying mapping: a missing key yields another one.
            return defaultdict(_nested)

        root = ElementTree.parse(fileid).getroot()
        participants = _nested()
        query = './/{%s}Participants/{%s}participant' % (NS, NS)
        for node in root.findall(query):
            for attr_name, attr_value in node.items():
                participants[node.get('id')][attr_name] = attr_value
        return participants
コード例 #9
0
ファイル: mychildes.py プロジェクト: langcog/alignment
    def _get_participants(self, fileid):
        """Gather per-participant attribute data from ``fileid``.

        :param fileid: Path or file object accepted by ``ElementTree.parse``.
        :return: Nested ``defaultdict`` keyed first by the participant's
            ``id`` attribute and then by attribute name.
        """
        def make_tree():
            # Multidimensional dict: every missing key creates a sub-dict.
            return defaultdict(make_tree)

        root = ElementTree.parse(fileid).getroot()
        by_id = make_tree()
        nodes = root.findall('.//{%s}Participants/{%s}participant'
                             % (NS, NS))
        for node in nodes:
            for pair in node.items():
                by_id[node.get('id')][pair[0]] = pair[1]
        return by_id
コード例 #10
0
ファイル: NNCCorpus.py プロジェクト: grgprarup/oya-nepali-nlp
    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Shared implementation behind the view methods: produce either a
        flat list of words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use (``'c5'``, ``'ctag'`` or
            ``'pos'``), or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        collected = []

        root = ElementTree.parse(fileid).getroot()
        for sentence_node in root.findall('.//s'):
            tokens = []
            for word_node in _all_xmlwords_in(sentence_node):
                # Elements without text yield the empty string, not None.
                token = word_node.text or ""
                if strip_space or stem:
                    token = token.strip()
                if stem:
                    # Fall back to the surface form when no 'hw' is given.
                    token = word_node.get('hw', token)
                if tag == 'c5':
                    token = (token, word_node.get('c5'))
                elif tag == 'ctag':
                    token = (token, word_node.get('ctag'))
                elif tag == 'pos':
                    token = (token,
                             word_node.get('pos', word_node.get('c5')))
                tokens.append(token)
            if not bracket_sent:
                collected.extend(tokens)
            else:
                collected.append(
                    NNCSentence(sentence_node.attrib['n'], tokens))

        assert None not in collected
        return collected
コード例 #11
0
ファイル: bnc.py プロジェクト: Journo-App/flask-by-example
    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Shared implementation behind the view methods: produce either a
        flat list of words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use (``'c5'`` or ``'pos'``),
            or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        collected = []

        root = ElementTree.parse(fileid).getroot()
        for sentence_node in root.findall('.//s'):
            tokens = []
            for word_node in _all_xmlwords_in(sentence_node):
                # Elements without text yield the empty string, not None.
                token = word_node.text or ""
                if strip_space or stem:
                    token = token.strip()
                if stem:
                    # Fall back to the surface form when no 'hw' is given.
                    token = word_node.get('hw', token)
                if tag == 'c5':
                    token = (token, word_node.get('c5'))
                elif tag == 'pos':
                    token = (token,
                             word_node.get('pos', word_node.get('c5')))
                tokens.append(token)
            if not bracket_sent:
                collected.extend(tokens)
            else:
                collected.append(
                    BNCSentence(sentence_node.attrib['n'], tokens))

        assert None not in collected
        return collected
コード例 #12
0
ファイル: mychildes.py プロジェクト: langcog/alignment
 def _get_words(self, fileid, speaker, sent, stem, relation, pos,
         strip_space, replace):
     """Extract (optionally stemmed / tagged) words from one transcript.

     Each utterance list starts with the utterance's ``who`` attribute,
     followed by one entry per ``w`` element, plus an extra entry for a
     ``mor-post`` stem when one is found.

     :param fileid: Path or file object accepted by ``ElementTree.parse``.
     :param speaker: ``'ALL'``, a single speaker id, or a collection of ids.
     :param sent: If true, return one list per utterance.
     :param stem: If true, use the ``stem`` text (plus ``-`` and the
         inflection text when present) instead of the surface word.
     :param relation: If ``True``, append grammatical-relation strings of
         the form ``index|head|relation``. Any true value also turns on
         stemming, tagging and per-utterance grouping.
     :param pos: If true, return ``(word, tag)`` tuples.
     :param strip_space: If true, strip whitespace from each word.
     :param replace: If true, look for replacement words.
     :return: List of utterance lists, or one flat list when neither
         ``sent`` nor ``relation`` is set.
     """
     if isinstance(speaker, string_types) and speaker != 'ALL':  # ensure we have a list of speakers
         speaker = [speaker]
     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall('.//{%s}u' % NS):
         sents = [xmlsent.get('who')]
         # select speakers
         if speaker == 'ALL' or xmlsent.get('who') in speaker:
             for xmlword in xmlsent.findall('.//{%s}w' % NS):
                 suffixStem = None
                 # getting replaced words
                 # NOTE(review): these lookups search the whole utterance
                 # (``xmlsent``), not the current word, and truth-testing
                 # an Element is true only when it has children -- confirm
                 # both are intended before changing them.
                 if replace and xmlsent.find('.//{%s}w/{%s}replacement'
                                             % (NS,NS)):
                     xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w'
                                            % (NS,NS,NS))
                 elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)):
                     xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS))
                 # get text
                 if xmlword.text:
                     word = xmlword.text
                 else:
                     word = ''
                 # strip tailing space
                 if strip_space:
                     word = word.strip()
                 # stem
                 if relation or stem:
                     try:
                         xmlstem = xmlword.find('.//{%s}stem' % NS)
                         word = xmlstem.text
                     except AttributeError:
                         # no <stem> element: keep the surface form
                         pass
                     # if there is an inflection
                     try:
                         xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk'
                                                % (NS,NS,NS))
                         word += '-' + xmlinfl.text
                     except Exception:
                         # best effort: missing inflection or missing text
                         pass
                     # if there is a suffix
                     try:
                         xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                                  % (NS,NS,NS,NS))
                         suffixStem = xmlsuffix.text
                     except AttributeError:
                         suffixStem = ""
                 # pos
                 if relation or pos:
                     try:
                         xmlpos = xmlword.findall(".//{%s}c" % NS)
                         xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                         if xmlpos2 != []:
                             tag = xmlpos[0].text+":"+xmlpos2[0].text
                         else:
                             tag = xmlpos[0].text
                         word = (word,tag)
                     except (AttributeError,IndexError):
                         word = (word,None)
                         if suffixStem:
                             suffixStem = (suffixStem,None)
                 # relational
                 # the gold standard is stored in
                 # <mor></mor><mor type="trn"><gra type="grt">
                 if relation == True:
                     for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra'
                                                        % (NS,NS)):
                         if not xmlstem_rel.get('type') == 'grt':
                             word = (word[0], word[1],
                                     xmlstem_rel.get('index')
                                     + "|" + xmlstem_rel.get('head')
                                     + "|" + xmlstem_rel.get('relation'))
                         else:
                             word = (word[0], word[1], word[2],
                                     word[0], word[1],
                                     xmlstem_rel.get('index')
                                     + "|" + xmlstem_rel.get('head')
                                     + "|" + xmlstem_rel.get('relation'))
                     try:
                         for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra'
                                                            % (NS,NS,NS)):
                             if not xmlpost_rel.get('type') == 'grt':
                                 suffixStem = (suffixStem[0],
                                               suffixStem[1],
                                               xmlpost_rel.get('index')
                                               + "|" + xmlpost_rel.get('head')
                                               + "|" + xmlpost_rel.get('relation'))
                             else:
                                 suffixStem = (suffixStem[0], suffixStem[1],
                                               suffixStem[2], suffixStem[0],
                                               suffixStem[1],
                                               xmlpost_rel.get('index')
                                               + "|" + xmlpost_rel.get('head')
                                               + "|" + xmlpost_rel.get('relation'))
                     except Exception:
                         # best effort: suffixStem may be None/"" or too
                         # short, or a gra attribute may be missing
                         pass
                 sents.append(word)
                 if suffixStem:
                     sents.append(suffixStem)
             if sent or relation:
                 results.append(sents)
             else:
                 results.extend(sents)
     return results
コード例 #13
0
    def get_custom_sents(
            self, fileid
    ):  # speaker, sent, stem, relation, pos, strip_space, replace):
        """Parse one transcript into a list of per-utterance records.

        Each record is a tuple
        ``(uID, who, terminator_type, annotations, media, tokens,
        actual_pho, model_pho)`` where

        * ``annotations`` is a list of dicts with the keys ``type``,
          ``flavor``, ``who`` and ``text``, one per ``a`` element;
        * ``media`` is a dict with ``start``/``end``/``unit`` when a
          ``media`` element exists, otherwise an empty dict;
        * ``tokens`` is a list of dicts, one per retained ``w`` element,
          with the keys ``gloss``, ``prefix``, ``pos``, ``stem``,
          ``suffix``, ``english``, ``clitic``, ``relation``,
          ``morpheme_length``, ``order``, ``pho`` and ``mod`` (plus
          ``replacement`` for words that contain a replacement);
        * ``actual_pho``/``model_pho`` are whatever ``get_phonology``
          returns, or empty lists when the file has no ``pw`` element.

        :param fileid: File identifier, resolved through ``self.abspaths``.
        :return: List of utterance tuples as described above.

        NOTE(review): an utterance without a ``t`` element would make the
        terminator lookup fail with ``AttributeError`` -- confirm every
        utterance carries one.
        """
        fileid = self.abspaths([fileid])[0]
        tree = ElementTree.parse(fileid)
        xmldoc = tree.getroot()

        # check if this file has phonological transcriptions
        if xmldoc.find('.//{%s}pw' % NS) is not None:
            fileHasPhonology = True
            print('File has phonological transcripts. Processing...')
        else:
            fileHasPhonology = False
            print(
                'File has no phonological transcripts. Skipping extraction of phonological information.'
            )

        results2 = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):

            # TODO confusing tuple structure, use map

            # The utterance record is grown piece by piece as a tuple.
            utt = ()

            sentID = xmlsent.get('uID')
            sents = []
            # place this in map
            speaker = xmlsent.get('who')  # ME

            utt += (sentID, speaker)

            tokens = []

            # 1-based position of the next retained token in the utterance
            token_order = 0

            # number of upcoming <w> elements to skip because they were
            # already consumed as children of a replaced word
            skip_replacement_counter = 0

            # extract utterance terminator
            terminator = xmlsent.find(".//{%s}t" % NS).attrib['type']
            utt += (terminator, )

            # get dependent tiers / annotations
            # TODO get a bunch of stuff and return in convenient format
            annotations = []
            annotation_elements = xmlsent.findall(".//{%s}a" % NS)
            for element in annotation_elements:
                annotation = {}
                annotation['type'] = element.attrib.get('type')
                annotation['flavor'] = element.attrib.get('flavor')
                annotation['who'] = element.attrib.get('who')
                annotation['text'] = element.text
                annotations.append(annotation)

            utt += (annotations, )
            # does this capture the phonetic tier?

            # extract media info, if it exists
            media = {}
            media_element = xmlsent.findall(".//{%s}media" % NS)

            if media_element:
                # only the first media element is used
                media['start'] = media_element[0].attrib['start']
                media['end'] = media_element[0].attrib['end']
                media['unit'] = media_element[0].attrib['unit']

            utt += (media, )

            # Pull out the phonology tiers
            if fileHasPhonology:
                actual_pho, model_pho = get_phonology(xmlsent, speaker, sentID,
                                                      fileid)
                # Word-level phonology is attached only when the tier has
                # exactly one entry per <w> element of the utterance.
                num_tokens = len(xmlsent.findall('.//{%s}w' % NS))
                include_actual_pho = num_tokens == len(actual_pho)
                include_model_pho = num_tokens == len(model_pho)
            else:
                actual_pho = []
                model_pho = []

            for xmlword in xmlsent.findall('.//{%s}w' % NS):

                # skip the replacements of a word - they've already been considered
                if skip_replacement_counter > 0:
                    skip_replacement_counter -= 1
                    continue

                token = {}

                # omitted words produce no token at all
                if xmlword.get('type') == 'omission':
                    continue

                suffixStem = None

                #xstr = lambda s: "" if s is None else unicode(s)
                # None-safe text accessor used when concatenating text/tail
                xstr = lambda s: "" if s is None else s

                # NOTE(review): truth-testing an Element is true only when
                # it has children, not merely when it was found -- confirm
                # this is the intended condition here and below.
                if xmlword.find('.//{%s}langs' % (NS)):
                    xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail

                # handles compounds and ignores shortenings (?)
                text_tags = [
                    "{%s}wk" % NS,
                    "{%s}p" % NS,
                    "{%s}shortening" % NS
                ]
                if xmlword.findall('*'):
                    # Rebuild the word text from its own text plus the
                    # text/tail of wk, p and shortening children; a "+" is
                    # inserted at each wk child.
                    word_tags = xmlword.findall('*')
                    text = xstr(xmlword.text)
                    for word_tag in word_tags:
                        if word_tag.tag in text_tags:
                            if word_tag.tag == "{%s}wk" % NS:
                                text += "+"
                            text += xstr(word_tag.text) + xstr(word_tag.tail)
                    # the parsed tree is mutated in place here
                    xmlword.text = text

                if xmlword.text:
                    word = xmlword.text
                    token['gloss'] = xmlword.text.strip()
                else:
                    print('empty word in sentence ' + str(sentID))
                    word = ''
                    token['gloss'] = ''

                # check if this is a replacement, and then build rep, stem, etc from children
                if xmlword.find('.//{%s}replacement' % (NS)):
                    # save children in replacement field
                    # iterate over children
                    replacements = []
                    prefix = []
                    pos = []
                    stems = []
                    suffix = []
                    english = []
                    clitics = []
                    relations = []
                    morpheme_length = None
                    children = xmlword.findall('.//{%s}w' % NS)
                    for child in children:
                        if child.text:
                            replacements.append(child.text)

                        prefix_result, pos_result, stem_result, suffix_result, english_result, clitic_result, morpheme_length_result = \
                            self._get_morphology(child)

                        if prefix_result:
                            prefix.append(prefix_result)

                        # pos_result = self._get_pos(child, None)
                        if pos_result:
                            pos.append(pos_result)

                        # stem_result = self._get_stem(child)
                        if stem_result:
                            stems.append(stem_result)

                        if suffix_result:
                            suffix.append(suffix_result)

                        if english_result:
                            english.append(english_result)

                        if clitic_result:
                            clitics.append(clitic_result)

                        relation_result = self._get_relation(child)
                        if relation_result:
                            relations.append(relation_result)

                        # morpheme lengths of all children are summed;
                        # None is kept when no child reports a length
                        if morpheme_length_result:
                            if morpheme_length:
                                morpheme_length += morpheme_length_result
                            else:
                                morpheme_length = morpheme_length_result

                    # per-child values are flattened into space-separated
                    # strings on the single token for the replaced word
                    token['replacement'] = ' '.join(replacements)
                    token['prefix'] = ' '.join(prefix)
                    token['pos'] = ' '.join(pos)
                    token['stem'] = ' '.join(stems)
                    token['suffix'] = ' '.join(suffix)
                    token['english'] = ' '.join(english)
                    token['clitic'] = ' '.join(clitics)
                    token['relation'] = ' '.join(relations)
                    token['morpheme_length'] = morpheme_length

                    # the nested <w> children follow in the outer iteration
                    # and must not be emitted again
                    skip_replacement_counter = len(children)
                else:  # else get stem and pos for this word
                    # word = word.strip()

                    token['prefix'], token['pos'], token['stem'], token['suffix'], token['english'], token['clitic'], token['morpheme_length'] = \
                        self._get_morphology(xmlword)

                    # token['stem'] = self._get_stem(xmlword)  # if suffix, should be in same column
                    # token['pos'] = self._get_pos(xmlword, suffixStem)
                    token['relation'] = self._get_relation(xmlword)

                    # replacement_elems = filter(lambda x: x.tag == '{%s}w' % NS, [e for e in xmlword.iter() if e is not xmlword])
                    # replacements = [r.text for r in replacement_elems]
                    # replacement_str = ' '.join(replacements)
                    # if replacement_str:
                    #     token['replacement'] = replacement_str
                    #     skip_replacement_counter = len(replacements)
                # parent_map = dict((c, p) for p in tree.getiterator() for c in p)
                #
                # if parent_map.get(xmlword) and parent_map.get(xmlword).tag == '{%s}replacement' % NS:
                #     last_token = tokens[len(tokens) - 1]
                #     last_token['replacement'] = token['gloss']
                #     continue # don't save this token in tokens array

                # record the token's 1-based position within the utterance
                token_order += 1
                token['order'] = token_order

                # only include the phonetic information at the word level if it aligns with the set of words
                # NOTE(review): the index is the retained-token position,
                # while the alignment check counts every <w> element --
                # verify they agree when omissions/replacements are skipped.
                if fileHasPhonology:
                    if include_actual_pho:
                        token['pho'] = actual_pho[(token_order - 1)]
                    else:
                        # mismatch in actual_pho and utterance length; not including actual pho at the word level
                        token['pho'] = ''

                    if include_model_pho:
                        token['mod'] = model_pho[(token_order - 1)]
                    else:
                        # mismatch in model_pho and utterance length; not including model pho at the word level
                        token['mod'] = ''
                else:
                    # whole file does not have phonology
                    token['pho'] = ''
                    token['mod'] = ''

                tokens.append(token)
                # if suffixStem:
                #     sents.append(suffixStem)
            results2.append(utt + (tokens, ) + (actual_pho, ) + (model_pho, ))
        return results2
コード例 #14
0
ファイル: mychildes.py プロジェクト: langcog/alignment
 def _get_corpus(self, fileid):
     results = dict()
     xmldoc = ElementTree.parse(fileid).getroot()
     for key, value in xmldoc.items():
         results[key] = value
     return results
コード例 #15
0
    def _get_words(self, fileid, speaker, sent, stem, relation, pos,
                   strip_space, replace):
        """Extract (optionally stemmed / tagged) words from one transcript.

        :param fileid: Path or file object accepted by ``ElementTree.parse``.
        :param speaker: ``'ALL'``, a single speaker id, or a collection of
            ids.
        :param sent: If true, return one list per utterance.
        :param stem: If true, use the ``stem`` text (plus ``-`` and the
            inflection text when present) instead of the surface word.
        :param relation: If ``True``, append grammatical-relation strings
            of the form ``index|head|relation``. Any true value also turns
            on stemming, tagging and per-utterance grouping.
        :param pos: If true, return ``(word, tag)`` tuples.
        :param strip_space: If true, strip whitespace from each word.
        :param replace: If true, skip words that contain a replacement.
        :return: List of utterance lists, or one flat list when neither
            ``sent`` nor ``relation`` is set.
        """
        if isinstance(
                speaker,
                str) and speaker != 'ALL':  # ensure we have a list of speakers
            speaker = [speaker]

        # ``unicode`` only exists on Python 2; fall back to ``str`` so the
        # helper below behaves the same on Python 3.
        try:
            text_type = unicode
        except NameError:
            text_type = str

        def xstr(s):
            # None-safe text accessor used when concatenating text/tail
            return "" if s is None else text_type(s)

        # child tags whose text/tail is folded into the word text
        text_tags = [
            "{%s}wk" % NS,
            "{%s}p" % NS,
            "{%s}shortening" % NS
        ]

        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
            sentID = xmlsent.get('uID')
            sents = []
            # select speakers
            if speaker == 'ALL' or xmlsent.get('who') in speaker:
                for xmlword in xmlsent.findall('.//{%s}w' % NS):

                    if xmlword.get('type') == 'omission':
                        continue

                    suffixStem = None

                    # getting replaced words
                    # NOTE(review): truth-testing an Element is true only
                    # when it has children -- confirm this is intended.
                    if replace and xmlword.find('.//{%s}replacement' % (NS)):
                        continue

                    if xmlword.find('.//{%s}langs' % (NS)):
                        xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail

                    if xmlword.findall('*'):
                        word_tags = xmlword.findall('*')
                        text = xstr(xmlword.text)
                        for word_tag in word_tags:
                            if word_tag.tag in text_tags:
                                if word_tag.tag == "{%s}wk" % NS:
                                    text += "+"
                                text += xstr(word_tag.text) + xstr(
                                    word_tag.tail)
                        xmlword.text = text

                    if xmlword.text:
                        word = xmlword.text
                    else:
                        # Parenthesised single-argument form prints the
                        # same text on Python 2 and Python 3.
                        print('empty word in sentence %s' % sentID)
                        word = ''

                    # strip tailing space
                    if strip_space:
                        word = word.strip()

                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                            word = xmlstem.text
                        except AttributeError:
                            # no <stem> element: keep the surface form
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' %
                                                   (NS, NS, NS))
                            word += '-' + xmlinfl.text
                        except Exception:
                            # best effort: missing inflection or text
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' %
                                (NS, NS, NS, NS))
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            word = (word, xmlpos[0].text)
                            if len(xmlpos) != 1 and suffixStem:
                                suffixStem = (suffixStem, xmlpos[1].text)
                        except (AttributeError, IndexError):
                            word = (word, None)
                            if suffixStem:
                                suffixStem = (suffixStem, None)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                                './/{%s}mor/{%s}gra' % (NS, NS)):
                            if not xmlstem_rel.get('type') == 'grt':
                                word = (word[0], word[1],
                                        xmlstem_rel.get('index') + "|" +
                                        xmlstem_rel.get('head') + "|" +
                                        xmlstem_rel.get('relation'))
                            else:
                                word = (word[0], word[1], word[2], word[0],
                                        word[1], xmlstem_rel.get('index') +
                                        "|" + xmlstem_rel.get('head') + "|" +
                                        xmlstem_rel.get('relation'))
                        try:
                            for xmlpost_rel in xmlword.findall(
                                    './/{%s}mor/{%s}mor-post/{%s}gra' %
                                    (NS, NS, NS)):
                                if not xmlpost_rel.get('type') == 'grt':
                                    suffixStem = (suffixStem[0], suffixStem[1],
                                                  xmlpost_rel.get('index') +
                                                  "|" +
                                                  xmlpost_rel.get('head') +
                                                  "|" +
                                                  xmlpost_rel.get('relation'))
                                else:
                                    suffixStem = (suffixStem[0], suffixStem[1],
                                                  suffixStem[2], suffixStem[0],
                                                  suffixStem[1],
                                                  xmlpost_rel.get('index') +
                                                  "|" +
                                                  xmlpost_rel.get('head') +
                                                  "|" +
                                                  xmlpost_rel.get('relation'))
                        except Exception:
                            # best effort: suffixStem may be None/"" or too
                            # short, or a gra attribute may be missing
                            pass
                    sents.append(word)
                    if suffixStem:
                        sents.append(suffixStem)
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        # The collected words were previously built but never returned.
        return results
コード例 #16
0
    def get_custom_sents(
            self, fileid
    ):  # speaker, sent, stem, relation, pos, strip_space, replace):
        """Parse one transcript into a list of per-utterance records.

        Each record is a tuple ``(uID, who, tokens)``; ``tokens`` is a list
        of dicts, one per retained ``w`` element, with the keys ``gloss``,
        ``stem``, ``pos``, ``relation`` and ``order`` (plus ``replacement``
        for words that contain a replacement).

        :param fileid: File identifier, resolved through ``self.abspaths``.
        :return: List of utterance tuples as described above.
        """
        fileid = self.abspaths([fileid])[0]
        tree = ElementTree.parse(fileid)
        xmldoc = tree.getroot()

        # ``unicode`` only exists on Python 2; fall back to ``str`` so the
        # helper below behaves the same on Python 3.
        try:
            text_type = unicode
        except NameError:
            text_type = str

        def xstr(s):
            # None-safe text accessor used when concatenating text/tail
            return "" if s is None else text_type(s)

        # handles compounds and ignores shortenings (?)
        text_tags = [
            "{%s}wk" % NS,
            "{%s}p" % NS,
            "{%s}shortening" % NS
        ]

        # processing each xml doc
        results2 = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):

            utt = ()

            sentID = xmlsent.get('uID')
            # place this in map
            speaker = xmlsent.get('who')  # ME

            utt += (sentID, speaker)

            tokens = []

            # 1-based position of the next retained token in the utterance
            token_order = 0

            # number of upcoming <w> elements to skip because they were
            # already consumed as children of a replaced word
            skip_replacement_counter = 0

            for xmlword in xmlsent.findall('.//{%s}w' % NS):

                # skip the replacements of a word - they've already been considered
                if skip_replacement_counter > 0:
                    skip_replacement_counter -= 1
                    continue

                token = {}

                if xmlword.get('type') == 'omission':
                    continue

                suffixStem = None

                # NOTE(review): truth-testing an Element is true only when
                # it has children -- confirm this is intended here and in
                # the replacement check below.
                if xmlword.find('.//{%s}langs' % (NS)):
                    xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail

                if xmlword.findall('*'):
                    word_tags = xmlword.findall('*')
                    text = xstr(xmlword.text)
                    for word_tag in word_tags:
                        if word_tag.tag in text_tags:
                            if word_tag.tag == "{%s}wk" % NS:
                                text += "+"
                            text += xstr(word_tag.text) + xstr(word_tag.tail)
                    xmlword.text = text

                if xmlword.text:
                    token['gloss'] = xmlword.text.strip()
                else:
                    # Parenthesised single-argument form prints the same
                    # text on Python 2 and Python 3.
                    print('empty word in sentence %s' % sentID)
                    token['gloss'] = ''

                # check if this is a replacement, and then build rep, stem, etc from children
                if xmlword.find('.//{%s}replacement' % (NS)):
                    # save children in replacement field
                    # iterate over children
                    replacements = []
                    stems = []
                    pos = []
                    relations = []
                    children = xmlword.findall('.//{%s}w' % NS)
                    for child in children:
                        # A child without text has ``text is None``, which
                        # would make the join below raise TypeError.
                        if child.text is not None:
                            replacements.append(child.text)
                        # NOTE(review): if these helpers can return None
                        # the joins below will still fail -- verify their
                        # return types.
                        stems.append(self._get_stem(child))
                        pos.append(self._get_pos(child, None))
                        relations.append(self._get_relation(child))
                    token['replacement'] = ' '.join(replacements)
                    token['stem'] = ' '.join(stems)
                    token['pos'] = ' '.join(pos)
                    token['relation'] = ' '.join(relations)

                    # the nested <w> children follow in the outer iteration
                    # and must not be emitted again
                    skip_replacement_counter = len(children)
                else:  # else get stem and pos for this word
                    token['stem'] = self._get_stem(
                        xmlword)  # if suffix, should be in same column
                    token['pos'] = self._get_pos(xmlword, suffixStem)
                    token['relation'] = self._get_relation(xmlword)

                # record the token's 1-based position within the utterance
                token_order += 1
                token['order'] = token_order

                tokens.append(token)
            results2.append(utt + (tokens, ))
        return results2
コード例 #17
0
            for overlap in gwrap.findall('.//{%s}overlap' % NS):
                if overlap.get('type') == type:
                    return True
    return False


def getAgeFromFileName(file):
    """Return the age in months encoded at the start of a file name.

    The name is split on ``-``; the first piece is read as a number of
    years and the second as a number of months. Any further pieces are
    ignored.
    """
    pieces = file.split('-')
    years, extra_months = pieces[0], pieces[1]
    return int(years) * 12 + int(extra_months)


# For every file id of the `thomas` reader, parse the XML and walk its
# utterances (<u> elements) as (utterance, following utterance) pairs.
# NOTE(review): this excerpt stops in the middle of the `while` body -- in
# the lines shown no row is written to the CSV and `i` is never advanced;
# the remainder of the loop is not visible here.
for file in thomas.fileids():
    xmldoc = ElementTree.parse(file).getroot()
    results = []
    # The output file is opened in append mode once per input file.
    with open('thomas_utterance_data.csv', 'a') as csvfile:
        # Column names handed to the DictWriter.
        fieldnames = [
            'utterance', 'response', 'speaker', 'responder', 'error', 'age',
            'past_tense', 'plural', 'source_file', 'utterance_start',
            'utterance_end', 'response_start', 'response_end'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        i = 0
        sents = xmldoc.findall('.//{%s}u' % NS)
        # Stop one short of the end so that sents[i + 1] always exists.
        while i + 1 < len(sents):
            data = {}
            data['utterance'] = getUtterance(sents[i])
            data['response'] = getUtterance(sents[i + 1])
            # The `who` attribute of the <u> element is stored as the speaker.
            data['speaker'] = sents[i].get('who')
コード例 #18
0
import nltk
from nltk.corpus.reader.xmldocs import ElementTree

# XML namespace used to qualify every tag looked up below.
NS = "http://www.talkbank.org/ns/talkbank"
# Render None as "" and anything else through str().
xstr = lambda s: "" if s is None else str(s)
corpus_root = nltk.data.find("corpora/childes/data-xml/English/Eng-NA-MOR/")

# NOTE(review): `childes` is not imported in the lines shown before this
# call, and `reload` is not a builtin on Python 3 -- confirm how this
# script is meant to be run (it looks like an interactive-session excerpt).
reload(childes)
from childes import CHILDESCorpusReader

# Read a single transcript and count its words (with replace=True).
corpus_reader = CHILDESCorpusReader(corpus_root, r"Providence/William/wil11.xml")
words = corpus_reader.words(replace=True)
freqs = nltk.FreqDist(words)

# Parse the same file directly to inspect the raw XML.
fileid = corpus_reader.fileids()[0]
xmldoc = ElementTree.parse(corpus_root + "/" + fileid).getroot()
xmlsents = xmldoc.findall(".//{%s}u" % NS)

# Hard-coded utterance / word positions picked for inspection.
xmlsent = xmlsents[173]
xmlwords = xmlsent.findall(".//{%s}w" % NS)
xmlword = xmlwords[2]

# Rebuild the word's text: its own text followed by the text and tail of
# every <shortening> descendant.
shortenings = xmlword.findall(".//{%s}shortening" % (NS))
text = xstr(xmlword.text) + "".join([xstr(short.text) + xstr(short.tail) for short in shortenings])
コード例 #19
0
 def _get_corpus(self, fileid):
     results = dict()
     xmldoc = ElementTree.parse(fileid).getroot()
     for key, value in xmldoc.items():
         results[key] = value
     return results
コード例 #20
0
 def _get_words(self, fileid, speaker, sent, stem, relation, pos,
                strip_space, replace):
     """Collect the words of the selected speakers' utterances in one file.

     :param fileid: handed unchanged to ``ElementTree.parse``.
     :param speaker: ``'ALL'``, a single speaker id (wrapped into a list),
         or a collection of ids; compared with each utterance's ``who``
         attribute.
     :param sent: if true, one list per utterance is returned instead of
         a flat list.
     :param stem: if true, the ``stem`` element text replaces the surface
         form, with ``-<mk text>`` appended when an ``mk`` element exists;
         a ``mor-post`` stem is emitted as an extra entry after the word.
     :param relation: if true, stemming and tagging are applied as well
         and the results are grouped per utterance; when it equals
         ``True`` each word tuple is extended with
         ``index|head|relation`` strings taken from ``gra`` elements.
     :param pos: if true, each word becomes a ``(word, tag)`` tuple
         (``tag`` is ``None`` when no tag could be read).
     :param strip_space: if true, the surface form is stripped.
     :param replace: if true, ``replacement`` / ``wk`` elements are looked
         up and may stand in for the word.
     :return: a list of words, or a list of per-utterance lists when
         ``sent`` or ``relation`` is true.
     """
     # ensure we have a list of speakers
     if isinstance(speaker, string_types) and speaker != 'ALL':
         speaker = [speaker]

     def has_children(elem):
         # Explicit form of the former ``if elem:`` test: an Element is
         # truthy only when it has child elements, and relying on that
         # implicit truth value is deprecated in recent Python versions.
         return elem is not None and len(elem) > 0

     def relation_label(elem):
         # "index|head|relation" of one <gra> element.
         return (elem.get('index') + "|" + elem.get('head') + "|" +
                 elem.get('relation'))

     # The search paths never change, so build them once.
     replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
     replacement_word_path = ('.//{%s}w/{%s}replacement/{%s}w' %
                              (NS, NS, NS))
     wk_path = './/{%s}w/{%s}wk' % (NS, NS)
     stem_path = './/{%s}stem' % NS
     infl_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
     suffix_stem_path = ('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' %
                         (NS, NS, NS, NS))
     stem_rel_path = './/{%s}mor/{%s}gra' % (NS, NS)
     post_rel_path = './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)

     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall('.//{%s}u' % NS):
         # select speakers
         if speaker != 'ALL' and xmlsent.get('who') not in speaker:
             continue
         sents = []
         for xmlword in xmlsent.findall('.//{%s}w' % NS):
             suffixStem = None
             # getting replaced words
             # NOTE(review): both lookups search the whole utterance, not
             # the current word, so every word of the utterance receives
             # the first match -- confirm this is intended.
             if replace:
                 replacement = xmlsent.find(replacement_path)
                 if has_children(replacement):
                     xmlword = xmlsent.find(replacement_word_path)
                 else:
                     clitic = xmlsent.find(wk_path)
                     if has_children(clitic):
                         xmlword = clitic
             # get text
             word = xmlword.text if xmlword.text else ''
             # strip trailing space
             if strip_space:
                 word = word.strip()
             # stem
             if relation or stem:
                 try:
                     xmlstem = xmlword.find(stem_path)
                     word = xmlstem.text
                 except AttributeError:
                     # no <stem>: keep the surface form
                     pass
                 # if there is an inflection
                 try:
                     xmlinfl = xmlword.find(infl_path)
                     word += '-' + xmlinfl.text
                 except (AttributeError, TypeError):
                     # no <mk> element, or one of the texts is None
                     pass
                 # if there is a suffix
                 try:
                     xmlsuffix = xmlword.find(suffix_stem_path)
                     suffixStem = xmlsuffix.text
                 except AttributeError:
                     suffixStem = ""
             # pos
             if relation or pos:
                 try:
                     xmlpos = xmlword.findall(".//{%s}c" % NS)
                     xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                     if xmlpos2:
                         tag = xmlpos[0].text + ":" + xmlpos2[0].text
                     else:
                         tag = xmlpos[0].text
                     word = (word, tag)
                 except (AttributeError, IndexError):
                     word = (word, None)
                     if suffixStem:
                         suffixStem = (suffixStem, None)
             # relational
             # the gold standard is stored in
             # <mor></mor><mor type="trn"><gra type="grt">
             # Kept as an equality test so that e.g. 1 still counts.
             if relation == True:
                 for xmlstem_rel in xmlword.findall(stem_rel_path):
                     if xmlstem_rel.get('type') != 'grt':
                         word = (word[0], word[1],
                                 relation_label(xmlstem_rel))
                     else:
                         # gold-standard entry: repeat word and tag, then
                         # append the gold relation
                         word = (word[0], word[1], word[2], word[0],
                                 word[1], relation_label(xmlstem_rel))
                 try:
                     for xmlpost_rel in xmlword.findall(post_rel_path):
                         if xmlpost_rel.get('type') != 'grt':
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           relation_label(xmlpost_rel))
                         else:
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           suffixStem[2], suffixStem[0],
                                           suffixStem[1],
                                           relation_label(xmlpost_rel))
                 except (AttributeError, IndexError, TypeError):
                     # best effort: suffixStem may be None or too short,
                     # or a <gra> attribute may be missing
                     pass
             sents.append(word)
             if suffixStem:
                 sents.append(suffixStem)
         if sent or relation:
             results.append(sents)
         else:
             results.extend(sents)
     return results
コード例 #21
0
    def _get_words(self, fileid, speaker, sent, stem, relation, pos,
                   strip_space, replace):
        """Collect the words of the selected speakers' utterances in one file.

        :param fileid: handed unchanged to ``ElementTree.parse``.
        :param speaker: ``'ALL'``, a single speaker id (wrapped into a
            list), or a collection of ids; compared with each utterance's
            ``who`` attribute.
        :param sent: if true, one list per utterance is returned instead
            of a flat list.
        :param stem: if true, the stem text replaces the surface form, an
            inflection is added through ``add_inflection`` and a clitic
            stem, when present, is emitted as an extra entry.
        :param relation: accepted for signature compatibility; it is not
            read anywhere in this method.
        :param pos: if true, each word becomes a ``(word, tag)`` tuple and
            a clitic becomes ``(clitic_stem, value)``.
        :param strip_space: if true, the surface form is stripped.
        :param replace: if true, the word element is first passed through
            ``get_replaced_word``.
        :return: a list of words, or a list of per-utterance lists when
            ``sent`` is true.
        """
        # ensure we have a list of speakers
        if isinstance(speaker, string_types) and speaker != 'ALL':
            speaker = [speaker]

        xmldoc = ElementTree.parse(fileid).getroot()

        # processing each sentence in xml doc
        results = []
        for xmlsent in xmldoc.findall(sent_node):
            # select speakers
            if speaker != 'ALL' and xmlsent.get('who') not in speaker:
                continue

            sents = []
            # process each word
            for xml_word in xmlsent.findall(word_node):
                clitic_stem = None

                # get replaced words
                if replace:
                    xml_word = get_replaced_word(xmlsent, xml_word)

                # get text (empty or missing text becomes '')
                word = xml_word.text if xml_word.text else ''

                # strip trailing space
                if strip_space:
                    word = word.strip()

                # get stemmed words
                if stem:
                    try:
                        xmlstem = xml_word.find(stem_node)
                        word = xmlstem.text
                    except AttributeError:
                        # no stem element: keep the surface form
                        pass

                    # if there is an inflection
                    try:
                        word = add_inflection(xml_word, word)
                    except Exception:
                        # Best effort, as before, but no longer swallowing
                        # KeyboardInterrupt / SystemExit.
                        pass

                    # if there is a clitic
                    try:
                        xmlclitic = xml_word.find(clitic_node)
                        clitic_stem = xmlclitic.text
                    except AttributeError:
                        clitic_stem = ''

                # get pos
                if pos:
                    try:
                        tag = get_pos_tag(xml_word)
                        word = (word, tag)
                    except (AttributeError, IndexError):
                        word = (word, None)

                    if clitic_stem:
                        # add clitic's pos tag if there is one
                        # in the parent class method, this branch does not
                        # fetch the clitic -- this is changed here
                        # NOTE(review): the value stored is the element's
                        # 'relation' attribute -- confirm that is the tag.
                        clitic_pos = xml_word.find(clitic_pos_tag_node)
                        if clitic_pos is not None:
                            clitic_stem = (clitic_stem,
                                           clitic_pos.get('relation'))
                        else:
                            clitic_stem = (clitic_stem, None)

                sents.append(word)
                if clitic_stem:
                    sents.append(clitic_stem)

            if sent:
                results.append(sents)
            else:
                results.extend(sents)

        return results
コード例 #22
0
ファイル: childes.py プロジェクト: mikabr/gender-input
    def _get_words(self, fileid, speaker, sent, stem, relation, pos,
            strip_space, replace):
        """Collect the words of the selected speakers' utterances in one file.

        Python 2 code (``print`` statement, ``except X, e`` syntax,
        ``unicode``).

        For each ``u`` element whose ``who`` attribute is selected, every
        ``w`` descendant is turned into a word: omissions are skipped, the
        text of ``wk`` / ``p`` / ``shortening`` children is folded into the
        word text, and, depending on the flags, the word is stemmed
        (``stem`` or ``relation``), tagged (``pos`` or ``relation``) and
        extended with ``index|head|relation`` strings (``relation == True``).
        Words are grouped per utterance when ``sent`` or ``relation`` is
        true, otherwise appended to one flat list.

        NOTE(review): no ``return`` statement is visible in this block, so
        as written the method returns None -- the excerpt may be truncated.
        """
        if isinstance(speaker, str) and speaker != 'ALL':  # ensure we have a list of speakers
            speaker = [ speaker ]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
            # utterance id, only used in the "empty word" message below
            sentID  = xmlsent.get('uID')
            sents = []
            # select speakers
            if speaker == 'ALL' or xmlsent.get('who') in speaker:
                for xmlword in xmlsent.findall('.//{%s}w' % NS):

                    # words marked as omissions are left out entirely
                    if xmlword.get('type') == 'omission':
                        continue

                    infl = None ; suffixStem = None

                    # getting replaced words
                    # helper: None -> "", anything else -> unicode(...)
                    xstr = lambda s: "" if s is None else unicode(s)
                    # Skip a word that contains a <replacement> element.
                    # NOTE(review): the test relies on Element truthiness
                    # (false for an element without children); presumably
                    # the replacing <w> elements are visited on their own by
                    # the './/w' search above -- confirm.
                    if replace and xmlword.find('.//{%s}replacement' % (NS)):
                        continue

                    # when a <langs> element (with children) is present, use
                    # the text that follows it as the word text
                    if xmlword.find('.//{%s}langs' % (NS)):
                        xmlword.text = xmlword.find('.//{%s}langs' % (NS)).tail

                    # child tags whose text and tail belong to the word text
                    text_tags = ["{%s}wk" % NS, "{%s}p" % NS, "{%s}shortening" % NS]
                    if xmlword.findall('*'):
                        word_tags = xmlword.findall('*')
                        text = xstr(xmlword.text)
                        for word_tag in word_tags:
                            if word_tag.tag in text_tags:
                                # a <wk> child contributes a "+" separator
                                if word_tag.tag == "{%s}wk" % NS:
                                    text += "+"
                                text += xstr(word_tag.text) + xstr(word_tag.tail)
                        # the rebuilt text is written back onto the element
                        xmlword.text = text

                    if xmlword.text:
                        word = xmlword.text
                    else:
                        # report the utterance id of any word without text
                        print 'empty word in sentence %s' % sentID
                        word = ''

                    # strip trailing space
                    if strip_space:
                        word = word.strip()

                    # stem
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                            word = xmlstem.text
                        except AttributeError, e:
                            # no <stem> element: keep the surface form
                            pass
                        # if there is an inflection
                        try:
                            xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk'
                                                   % (NS,NS,NS))
                            word += '-' + xmlinfl.text
                        except:
                            # best effort: any failure leaves the word as is
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                                     % (NS,NS,NS,NS))
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                    # pos
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            word = (word,xmlpos[0].text)
                            # a second <c> element is used as the suffix tag
                            if len(xmlpos) != 1 and suffixStem:
                                suffixStem = (suffixStem,xmlpos[1].text)
                        except (AttributeError,IndexError), e:
                            word = (word,None)
                            if suffixStem:
                                suffixStem = (suffixStem,None)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra'
                                                           % (NS,NS)):
                            if not xmlstem_rel.get('type') == 'grt':
                                # (word, tag, "index|head|relation")
                                word = (word[0], word[1],
                                        xmlstem_rel.get('index')
                                        + "|" + xmlstem_rel.get('head')
                                        + "|" + xmlstem_rel.get('relation'))
                            else:
                                # 'grt' entry: word and tag are repeated and
                                # followed by this entry's relation string
                                word = (word[0], word[1], word[2],
                                        word[0], word[1],
                                        xmlstem_rel.get('index')
                                        + "|" + xmlstem_rel.get('head')
                                        + "|" + xmlstem_rel.get('relation'))
                        try:
                            # same construction for the mor-post (suffix) part
                            for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra'
                                                               % (NS,NS,NS)):
                                if not xmlpost_rel.get('type') == 'grt':
                                    suffixStem = (suffixStem[0],
                                                  suffixStem[1],
                                                  xmlpost_rel.get('index')
                                                  + "|" + xmlpost_rel.get('head')
                                                  + "|" + xmlpost_rel.get('relation'))
                                else:
                                    suffixStem = (suffixStem[0], suffixStem[1],
                                                  suffixStem[2], suffixStem[0],
                                                  suffixStem[1],
                                                  xmlpost_rel.get('index')
                                                  + "|" + xmlpost_rel.get('head')
                                                  + "|" + xmlpost_rel.get('relation'))
                        except:
                            # best effort: any failure is silently ignored
                            pass
                    sents.append(word)
                    # the suffix, when present, is emitted as its own entry
                    if suffixStem:
                        sents.append(suffixStem)
                # one list per utterance, or one flat list of words
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
コード例 #23
0
ファイル: childes.py プロジェクト: zlpmichelle/nltk
 def _get_words(self, fileid, speaker, sent, stem, relation, pos,
                strip_space, replace):
     """Collect the words of the selected speakers' utterances in one file.

     :param fileid: handed unchanged to ``ElementTree.parse``.
     :param speaker: ``"ALL"``, a single speaker id (wrapped into a list),
         or a collection of ids; compared with each utterance's ``who``
         attribute.
     :param sent: if true, one list per utterance is produced instead of
         a flat list.
     :param stem: if true, the ``stem`` element text replaces the surface
         form, with ``-<mk text>`` appended when an ``mk`` element exists
         and ``~<mor-post stem>`` appended when a suffix stem exists.
     :param relation: if true, stemming and tagging are applied as well
         and the results are grouped per utterance; when it equals
         ``True`` each word tuple is extended with
         ``index|head|relation`` strings taken from ``gra`` elements.
     :param pos: if true, each word becomes a ``(word, tag)`` tuple; the
         tag is ``""`` when none could be read and gets ``~<suffix tag>``
         appended when a ``mor-post`` tag exists.
     :param strip_space: if true, the surface form is stripped.
     :param replace: if true, ``replacement`` / ``wk`` elements are looked
         up and may stand in for the word.
     :return: ``LazyMap`` applying the identity function to the collected
         list.
     """
     # ensure we have a list of speakers
     if isinstance(speaker, string_types) and speaker != "ALL":
         speaker = [speaker]

     def has_children(elem):
         # Explicit form of the former ``if elem:`` test: an Element is
         # truthy only when it has child elements, and relying on that
         # implicit truth value is deprecated in recent Python versions.
         return elem is not None and len(elem) > 0

     def relation_label(elem):
         # "index|head|relation" of one <gra> element.
         return (elem.get("index") + "|" + elem.get("head") + "|" +
                 elem.get("relation"))

     # The search paths never change, so build them once.
     replacement_path = ".//{%s}w/{%s}replacement" % (NS, NS)
     replacement_word_path = (".//{%s}w/{%s}replacement/{%s}w" %
                              (NS, NS, NS))
     wk_path = ".//{%s}w/{%s}wk" % (NS, NS)
     stem_path = ".//{%s}stem" % NS
     infl_path = ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
     suffix_stem_path = (".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem" %
                         (NS, NS, NS, NS))
     suffix_c_path = (".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c" %
                      (NS, NS, NS, NS, NS))
     suffix_s_path = (".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s" %
                      (NS, NS, NS, NS, NS))
     stem_rel_path = ".//{%s}mor/{%s}gra" % (NS, NS)
     post_rel_path = ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)

     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall(".//{%s}u" % NS):
         # select speakers
         if speaker != "ALL" and xmlsent.get("who") not in speaker:
             continue
         sents = []
         for xmlword in xmlsent.findall(".//{%s}w" % NS):
             suffixStem = None
             suffixTag = None
             # getting replaced words
             # NOTE(review): both lookups search the whole utterance, not
             # the current word, so every word of the utterance receives
             # the first match -- confirm this is intended.
             if replace:
                 replacement = xmlsent.find(replacement_path)
                 if has_children(replacement):
                     xmlword = xmlsent.find(replacement_word_path)
                 else:
                     clitic = xmlsent.find(wk_path)
                     if has_children(clitic):
                         xmlword = clitic
             # get text
             word = xmlword.text if xmlword.text else ""
             # strip trailing space
             if strip_space:
                 word = word.strip()
             # stem
             if relation or stem:
                 try:
                     xmlstem = xmlword.find(stem_path)
                     word = xmlstem.text
                 except AttributeError:
                     # no <stem>: keep the surface form
                     pass
                 # if there is an inflection
                 try:
                     xmlinfl = xmlword.find(infl_path)
                     word += "-" + xmlinfl.text
                 except (AttributeError, TypeError):
                     # no <mk> element, or one of the texts is None
                     pass
                 # if there is a suffix
                 try:
                     xmlsuffix = xmlword.find(suffix_stem_path)
                     suffixStem = xmlsuffix.text
                 except AttributeError:
                     suffixStem = ""
                 if suffixStem:
                     word += "~" + suffixStem
             # pos
             if relation or pos:
                 try:
                     xmlpos = xmlword.findall(".//{%s}c" % NS)
                     xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                     if xmlpos2:
                         tag = xmlpos[0].text + ":" + xmlpos2[0].text
                     else:
                         tag = xmlpos[0].text
                 except (AttributeError, IndexError):
                     tag = ""
                 try:
                     xmlsuffixpos = xmlword.findall(suffix_c_path)
                     xmlsuffixpos2 = xmlword.findall(suffix_s_path)
                     if xmlsuffixpos2:
                         suffixTag = (xmlsuffixpos[0].text + ":" +
                                      xmlsuffixpos2[0].text)
                     else:
                         suffixTag = xmlsuffixpos[0].text
                 except (AttributeError, IndexError, TypeError):
                     # no suffix tag available (or its text is None)
                     pass
                 if suffixTag:
                     tag += "~" + suffixTag
                 word = (word, tag)
             # relational
             # the gold standard is stored in
             # <mor></mor><mor type="trn"><gra type="grt">
             # Kept as an equality test so that e.g. 1 still counts.
             if relation == True:
                 for xmlstem_rel in xmlword.findall(stem_rel_path):
                     if xmlstem_rel.get("type") != "grt":
                         word = (word[0], word[1],
                                 relation_label(xmlstem_rel))
                     else:
                         # gold-standard entry: repeat word and tag, then
                         # append the gold relation
                         word = (word[0], word[1], word[2], word[0],
                                 word[1], relation_label(xmlstem_rel))
                 # NOTE(review): suffixStem is not read again after this
                 # block, so the tuples built here never reach the
                 # output -- confirm whether this is leftover code.
                 try:
                     for xmlpost_rel in xmlword.findall(post_rel_path):
                         if xmlpost_rel.get("type") != "grt":
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           relation_label(xmlpost_rel))
                         else:
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           suffixStem[2], suffixStem[0],
                                           suffixStem[1],
                                           relation_label(xmlpost_rel))
                 except (AttributeError, IndexError, TypeError):
                     # best effort: suffixStem may be None or too short,
                     # or a <gra> attribute may be missing
                     pass
             sents.append(word)
         if sent or relation:
             results.append(sents)
         else:
             results.extend(sents)
     return LazyMap(lambda x: x, results)