Example #1
0
    def __recursive_tag_info_retriever(self, parent_tag, element):
        """
        Walk an xml element and collect the information carried by its
        children (text and attributes), recursing into nested elements.

        Every collected entry is keyed by a '#'-separated path made of the
        tags leading to it, attributes being keyed from
        ``<normalized tag>:<attribute name>``. Each value is a
        ``(self._ATTR_CMPT, normalized_value)`` tuple, and entries are
        combined with ``merge_two_dicts``. ``self._ATTR_CMPT`` is
        incremented once for every child handled as a leaf.

        :param parent_tag: path of the parent of ``element``, used as the
            prefix of the generated keys (unused when ``element`` is the
            ``teiHeader``, whose path is always ``_``)
        :param element: the xml element whose children are inspected
        :return: a dict holding the information found under ``element``
        """
        element_information = {}

        # Drop the '{namespace}' prefix of the tag, if there is one.
        element_tag = element.tag.rsplit('}', 1)[-1]
        element_tag = u'_' \
            if element_tag == u'teiHeader'\
            else u'%s#%s' % (parent_tag, element_tag)

        # list(element) replaces the deprecated Element.getchildren(),
        # which is no longer available in recent xml.etree versions.
        for child in list(element):
            has_child = len(child) > 0
            # The node is not a leaf
            if has_child:
                # NOTE(review): attribute names cannot contain '=', so this
                # test looks always true: a node with children is both
                # recursed into and then handled as a leaf just below. The
                # original comment said "If the child element is only there
                # for rendering"; confirm the intent before changing it.
                if 'rend=' not in child.attrib:
                    has_child = False
                    element_information = merge_two_dicts(
                        self.__recursive_tag_info_retriever(
                            element_tag, child), element_information)
            # The node is a leaf (or has just been downgraded to one)
            if not has_child:
                if child.text:
                    try:
                        # Gather the text of the node and of its descendants
                        text = u''.join(child.itertext())
                    except ValueError:
                        text = child.text
                else:
                    # Self-closing element
                    text = ""
                (normalized_text,
                 normalized_tag) = self.__normalize_metadata(text, child.tag)

                element_information = merge_two_dicts(
                    {
                        u'%s#%s' % (element_tag, normalized_tag):
                        (self._ATTR_CMPT, normalized_text)
                    }, element_information)

                for attribute_key, attribute_value in child.attrib.items():
                    (normalized_attribute_value, normalized_attribute_key) =\
                        self.__normalize_metadata(
                            attribute_value,
                            normalized_tag + ':' + attribute_key)
                    element_information = merge_two_dicts(
                        {
                            u'%s#%s' % (element_tag, normalized_attribute_key):
                            (self._ATTR_CMPT, normalized_attribute_value)
                        }, element_information)
                # The text and the attributes of one child share one counter
                # value; move on to the next value for the next child.
                self._ATTR_CMPT += 1

        return element_information
Example #2
0
    def __parse_header(self):
        """Parses the header of an xml (or tei) document.

        It keeps all the attributes-values pairs encoded in the teiHeader,
        then adds the xml:id of the root element (as ``ark``) and, when the
        header declares no language, the xml:lang of the root element.
        Entries are finally filtered by the self.__clean_metadata() method.

        The result for the current document is stored in
        self.header_metadata. Nothing is done when the document has no
        root element; a document without teiHeader only gets the entries
        taken from the root element attributes.
        """

        # Parsing the document header
        self.etree_root = self.etree_xml.getroot()
        if self.etree_root is None:
            return

        metadata_root = self.etree_root.find(self.namespace + u'teiHeader')
        # find() returns None when the document has no teiHeader: there is
        # then nothing to walk through.
        if metadata_root is not None:
            self.header_metadata = merge_two_dicts(
                self.header_metadata,
                self.__recursive_tag_info_retriever(u'', metadata_root))

        root_attributes = self.etree_root.attrib

        # Adding Ark identifier if it exists
        ark_id = root_attributes.get(
            '{http://www.w3.org/XML/1998/namespace}id')
        if ark_id:
            self.header_metadata[u'ark'] = (0, ark_id)

        # Adding language from xml:lang="" <TEI> if language not in
        # header_metadata
        language_tei = root_attributes.get(
            '{http://www.w3.org/XML/1998/namespace}lang')
        language_key = u'_#profileDesc#langUsage#language:ident'
        if language_tei and language_key not in self.header_metadata:
            self.header_metadata[language_key] = (0, language_tei)

        # Cleaning useless or empty entries
        self.__clean_metadata()
Example #3
0
def test_utils_merge_two_dicts1():
    """Recursive merge & append dict values on nested dicts: Should pass

    Keys present in only one dict are kept as they are, nested dicts are
    merged recursively, and a key holding different values on both sides
    ends up with the list [value_of_first_dict, value_of_second_dict].
    """

    dic_y = {
        'both': {
            'both_y_diff': 'bar',
            'both_same': 'same_y'
        },
        'only_y': 'only_y'
    }
    dic_x = {
        'both': {
            'both_x_diff': 'foo',
            'both_same': 'same_x'
        },
        'only_x': {
            'only_x': 'baz'
        }
    }
    merged = merge_two_dicts(dic_x, dic_y)

    truth = {
        'both': {
            'both_same': ['same_x', 'same_y'],
            'both_x_diff': 'foo',
            'both_y_diff': 'bar'
        },
        'only_x': {
            'only_x': 'baz'
        },
        'only_y': 'only_y'
    }
    # Plain equality instead of cmp(): cmp() does not exist in Python 3,
    # and a failing == lets the test runner show the differing entries.
    assert merged == truth
Example #4
0
def test_utils_merge_two_dicts2():
    """Merge & append dict values on flat dicts: Should pass

    Keys present in both dicts with different values end up with the list
    [value_of_first_dict, value_of_second_dict]; the other keys are kept
    as they are.
    """

    x = {'both1': 'botha1x', 'both2': 'botha2x', 'only_x': 'only_x'}
    y = {'both1': 'botha1y', 'both2': 'botha2y', 'only_y': 'only_y'}
    merged = merge_two_dicts(x, y)
    truth = {
        'both1': ['botha1x', 'botha1y'],
        'both2': ['botha2x', 'botha2y'],
        'only_x': 'only_x',
        'only_y': 'only_y'
    }
    # Plain equality instead of cmp(): cmp() does not exist in Python 3,
    # and a failing == lets the test runner show the differing entries.
    assert merged == truth
Example #5
0
    def _transform_header_metadata_with_keyword(self):
        """ Regroup the flat header metadata under simple entry points.

        Each key of self.header_metadata is a path of the form
        ``<parents>#<tag>`` or ``<parents>#<tag>:<attribute>``. So that
        every type of information (author, title, date...) can be reached
        easily, the metadata are reorganised in three levels:

            tag -> parents path -> attribute (or the tag itself) -> value

        For instance, with ``v`` standing for the stored value:

            u'_#fileDesc#titleStmt#author'      : v
            u'_#fileDesc#titleStmt#author:role' : v
            u'ark'                              : v

        respectively contribute:

            {u'author': {u'_#fileDesc#titleStmt': {u'author': v}}}
            {u'author': {u'_#fileDesc#titleStmt': {u'role': v}}}
            {u'ark': {u'': {u'ark': v}}}

        The contributions are combined with ``merge_two_dicts``, entries
        holding an empty value are dropped, and the result replaces
        self.header_metadata.
        """
        regrouped = {}
        for flat_key, value in self.header_metadata.items():
            # Empty values carry no information
            if not value:
                continue

            # Everything before the last '#' is the path of the parents
            parents_path, _, leaf = flat_key.rpartition('#')

            # The leaf is either 'tag:attribute' or a bare 'tag'; a bare
            # tag is used as its own attribute name.
            tag_name, separator, attribute_name = leaf.rpartition(':')
            if not separator:
                tag_name = attribute_name = leaf

            regrouped = merge_two_dicts(
                regrouped,
                {tag_name: {parents_path: {attribute_name: value}}})

        self.header_metadata = regrouped