def __recursive_tag_info_retriever(self, parent_tag, element):
    """
    Dive into an xml element to retrieve all the information from it and
    its children.

    Each child of ``element`` contributes one entry for its text and one
    entry per attribute. Keys are the '#'-separated path of ``element``
    followed by the normalized tag (or normalized ``tag:attribute``)
    returned by ``self.__normalize_metadata``. Values are
    ``(self._ATTR_CMPT, normalized_value)`` tuples, so the text and the
    attributes of one child carry the same counter value.
    ``self._ATTR_CMPT`` is incremented once per processed child.

    :param parent_tag: :String: '#'-separated path of the ancestors of
        ``element``; ignored when ``element`` is the ``teiHeader``
    :param element: An xml element of the TEI document to parse
    :return: :dict: The information contained in the descendants of
        ``element``
    """
    element_information = {}

    # Strip the '{namespace}' prefix, then build the path of this element.
    # The teiHeader itself is the root of every path and is written '_'.
    element_tag = element.tag.rsplit('}', 1)[-1]
    if element_tag == u'teiHeader':
        element_tag = u'_'
    else:
        element_tag = u'%s#%s' % (parent_tag, element_tag)

    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()`` (removed from ElementTree in Python 3.9).
    for child in element:
        has_child = len(child) > 0

        # The node is not a leaf
        if has_child:
            # NOTE(review): attribute names of a parsed element cannot
            # contain '=', so this test looks always true and the leaf
            # branch below also runs for non-leaf nodes. Confirm whether
            # 'rend' was intended (child only there for rendering).
            if 'rend=' not in child.attrib:
                has_child = False
            element_information = merge_two_dicts(
                self.__recursive_tag_info_retriever(element_tag, child),
                element_information)

        # The node is (treated as) a leaf
        if not has_child:
            if child.text:
                # Prefer the whole textual content, inline children included
                try:
                    text = u''.join(child.itertext())
                except ValueError:
                    text = child.text
                (normalized_text, normalized_tag) = self.__normalize_metadata(
                    text, child.tag)
            else:
                # Self-closing element
                (normalized_text, normalized_tag) = self.__normalize_metadata(
                    "", child.tag)

            element_information = merge_two_dicts(
                {
                    u'%s#%s' % (element_tag, normalized_tag):
                        (self._ATTR_CMPT, normalized_text)
                },
                element_information)

            for attribute_key, attribute_value in child.attrib.items():
                (normalized_attribute_value, normalized_attribute_key) = \
                    self.__normalize_metadata(
                        attribute_value,
                        normalized_tag + ':' + attribute_key)
                element_information = merge_two_dicts(
                    {
                        u'%s#%s' % (element_tag, normalized_attribute_key):
                            (self._ATTR_CMPT, normalized_attribute_value)
                    },
                    element_information)

            # Next child gets a fresh counter value
            self._ATTR_CMPT += 1

    return element_information
def __parse_header(self):
    """Parses the header of an xml (or tei) document.

    It keeps all the attributes-values pairs encoded in the header,
    except for those explicitly excluded by the self.__clean_metadata()
    method. The result for the current document is stored in
    self.header_metadata.

    The ``xml:id`` attribute of the root element, when present, is stored
    under ``u'ark'``. The ``xml:lang`` attribute of the root is used as
    the document language when the header did not provide one.

    Does nothing when the parsed tree has no root. A document without a
    ``teiHeader`` element contributes no header entries, but the root
    attributes are still read.
    """
    # Parsing the document header
    self.etree_root = self.etree_xml.getroot()
    if self.etree_root is None:
        return

    metadata_root = self.etree_root.find(self.namespace + u'teiHeader')
    # find() returns None when there is no teiHeader; passing None to the
    # recursive retriever would raise AttributeError.
    if metadata_root is not None:
        self.header_metadata = merge_two_dicts(
            self.header_metadata,
            self.__recursive_tag_info_retriever(u'', metadata_root))

    # Adding Ark identifier if it exists
    ark_id = self.etree_root.attrib.get(
        '{http://www.w3.org/XML/1998/namespace}id', None)
    if ark_id:
        self.header_metadata[u'ark'] = (0, ark_id)

    # Adding language from xml:lang="" <TEI> if language not in
    # header_metadata
    language_tei = self.etree_root.attrib.get(
        '{http://www.w3.org/XML/1998/namespace}lang', None)
    if (language_tei and
            '_#profileDesc#langUsage#language:ident'
            not in self.header_metadata):
        self.header_metadata[
            u'_#profileDesc#langUsage#language:ident'] = (0, language_tei)

    # Cleaning useless or empty entries
    self.__clean_metadata()
def test_utils_merge_two_dicts1():
    """Recursive merge & append dict values: Should pass"""
    dic_y = {
        'both': {
            'both_y_diff': 'bar',
            'both_same': 'same_y'
        },
        'only_y': 'only_y'
    }
    dic_x = {
        'both': {
            'both_x_diff': 'foo',
            'both_same': 'same_x'
        },
        'only_x': {
            'only_x': 'baz'
        }
    }
    merged = merge_two_dicts(dic_x, dic_y)
    truth = {
        'both': {
            'both_same': ['same_x', 'same_y'],
            'both_x_diff': 'foo',
            'both_y_diff': 'bar'
        },
        'only_x': {
            'only_x': 'baz'
        },
        'only_y': 'only_y'
    }
    # Plain equality: cmp() no longer exists in Python 3, and for dicts
    # ``cmp(a, b) == 0`` is the same check as ``a == b``.
    assert merged == truth
def test_utils_merge_two_dicts2():
    """Recursive merge & append dict values: Should pass"""
    x = {'both1': 'botha1x', 'both2': 'botha2x', 'only_x': 'only_x'}
    y = {'both1': 'botha1y', 'both2': 'botha2y', 'only_y': 'only_y'}
    merged = merge_two_dicts(x, y)
    truth = {
        'both1': ['botha1x', 'botha1y'],
        'both2': ['botha2x', 'botha2y'],
        'only_x': 'only_x',
        'only_y': 'only_y'
    }
    # Plain equality: cmp() no longer exists in Python 3, and for dicts
    # ``cmp(a, b) == 0`` is the same check as ``a == b``.
    assert merged == truth
def _transform_header_metadata_with_keyword(self):
    """
    Regroup ``self.header_metadata`` under simple entry points (eg.
    author, title, date...).

    Each flat key, of the form ``<parents>#<tag>`` or
    ``<parents>#<tag>:<attribute>``, is split on its last ``#`` and then,
    when the remaining part contains one, on its last ``:``. The value is
    stored as ``{tag: {parents: {attribute: value}}}``. For a key without
    ``:`` the tag name is used in place of the attribute name.

    Shape of the regrouping (tag / parents / attribute):

        u'_#fileDesc#titleStmt#author'
            -> u'author' / u'_#fileDesc#titleStmt' / u'author'
        u'_#fileDesc#titleStmt#author:role'
            -> u'author' / u'_#fileDesc#titleStmt' / u'role'

    Entries whose value is falsy are skipped. The individual entries are
    combined with ``merge_two_dicts``, which therefore decides what happens
    when two entries land on the same path. The regrouped dict replaces
    ``self.header_metadata``.
    """
    regrouped = {}
    for flat_key, value in self.header_metadata.items():
        # Nothing to keep for empty values
        if not value:
            continue

        parents, _, tail = flat_key.rpartition('#')

        # An empty separator means ``tail`` holds no ':' and so names the
        # element itself rather than one of its attributes.
        tag, separator, attribute = tail.rpartition(':')
        if not separator:
            tag = attribute = tail

        regrouped = merge_two_dicts(
            regrouped, {tag: {parents: {attribute: value}}})

    self.header_metadata = regrouped