Esempio n. 1
0
 def convert_address_linking_elements(self, top):
     """
     Rewrite the address linking elements of the Journal Publishing Tag Set
     (<email>, <ext-link> and <uri>) found anywhere below ``top`` as <a>
     elements, the only hypertext linking element appropriate in OPS.

     Every converted element is modified in place and receives an ``href``
     attribute; nothing is returned.
     """
     #An email turns into a mailto link addressed to its own text; every
     #attribute it carried before is dropped
     for mail_el in top.findall('.//email'):
         element_methods.remove_all_attributes(mail_el)
         mail_el.tag = 'a'
         mail_el.attrib['href'] = 'mailto:{0}'.format(mail_el.text)
     #Ext-links and uris often declare their address in an xlink:href
     #attribute; when it is missing or empty, the link points at the text
     #the element contains. Only ext-links keep their id attribute.
     for search_path, keep_id in (('.//ext-link', True), ('.//uri', False)):
         for link in top.findall(search_path):
             link.tag = 'a'
             href_name = element_methods.ns_format(link, 'xlink:href')
             #The address must be read before the attributes are stripped
             href = element_methods.get_attribute(link, href_name)
             if keep_id:
                 element_methods.remove_all_attributes(link, exclude=['id'])
             else:
                 element_methods.remove_all_attributes(link)
             if not href:
                 href = element_methods.all_text(link)
             link.attrib['href'] = href
def frontiers_dc_date(article):
    """
    Given an Article class instance, this provides the method for extracting
    important dates in the history of the article. These are returned as a list
    of date_tup(year, month, day, event). This method looks specifically to
    locate the date when Frontiers accepted the article (event 'creation') and
    the date when it was published online (event 'publication').

    A year, month or day that is absent from the source is reported as an
    empty string. If the article has no history, an empty list is returned
    without looking at the publication dates.
    """
    def packed_text(packed):
        #NOTE(review): optional children of pub-date are presumably packed as
        #None when absent - confirm against the metadata packing code
        return packed.text if packed is not None else ''

    date_list = []
    history = article.metadata.front.article_meta.history
    if history is None:
        return date_list
    #Creation is a Dublin Core event value: I interpret it as the date of acceptance
    #For some reason, the lxml dtd parser fails to recognize the content model
    #history (something to do with expanded content model? I am not sure yet)
    #So for now, this will illustrate a work-around using lxml search
    for date in history.node.findall('date'):
        if date.attrib.get('date-type') != 'accepted':
            continue
        parts = []
        for part_tag in ('year', 'month', 'day'):
            part_el = date.find(part_tag)
            #Every missing part becomes ''; previously a missing <day> left
            #the variable unassigned (or holding a value from an earlier date)
            if part_el is None:
                parts.append('')
            else:
                parts.append(element_methods.all_text(part_el))
        year, month, day = parts
        date_list.append(date_tup(year, month, day, 'creation'))

    #Publication is another Dublin Core event value: I use date of epub
    pub_dates = article.metadata.front.article_meta.pub_date
    for pub_date in pub_dates:
        if pub_date.attrs['pub-type'] == 'epub':
            date_list.append(date_tup(packed_text(pub_date.year),
                                      packed_text(pub_date.month),
                                      packed_text(pub_date.day),
                                      'publication'))
    return date_list
Esempio n. 3
0
    def recursive_article_navmap(self, src_element, depth=0, first=True):
        """
        Walk the direct children of ``src_element`` and build the entries for
        the NCX file's navMap and Lists.

        Only children tagged 'sec', 'fig' or 'table-wrap' that carry a
        non-empty immediate <title> produce an entry. Sections are returned
        as a list of navpoints (with their own sub-sections nested through
        recursion); figures and tables are appended to ``self.figures_list``
        and ``self.tables_list`` instead. ``self.nav_depth`` is raised to
        ``depth`` when ``depth`` exceeds it. ``first`` is not used.
        """
        if self.nav_depth < depth:
            self.nav_depth = depth
        collected = []
        for node in src_element:
            #Children without a tag attribute, or with an uninteresting tag,
            #are ignored
            kind = getattr(node, 'tag', None)
            if kind not in ('sec', 'fig', 'table-wrap'):
                continue

            #Safely handle missing id attributes
            node_attrs = node.attrib
            if 'id' not in node_attrs:
                node_attrs['id'] = self.auto_id
            node_id = node_attrs['id']

            #In collection mode the article DOI is prepended to avoid
            #collisions
            if self.collection:
                nav_id = '-'.join([self.article_doi, node_id])
            else:
                nav_id = node_id

            #The label comes from the immediate title; without a title, or
            #without text in it, the element is skipped
            title_el = node.find('title')
            if title_el is None:
                continue
            label = element_methods.all_text(title_el)
            if not label:
                continue

            source = 'main.{0}.xhtml#{1}'.format(self.article_doi, node_id)
            if kind == 'sec':
                #The nested sections are gathered before play_order is read
                nested = self.recursive_article_navmap(node, depth=depth + 1)
                collected.append(
                    navpoint(nav_id, label, self.play_order, source, nested))
            else:
                #figs and table-wraps do not have children; they go to the
                #list of figures or the list of tables under their plain id
                if kind == 'fig':
                    target_list = self.figures_list
                else:
                    target_list = self.tables_list
                target_list.append(navpoint(node_id, label, None, source, []))
        return collected
Esempio n. 4
0
 def recursive_article_navmap(self, src_element, depth=0, first=True):
     """
     Walk the direct children of ``src_element`` and build the entries for
     the NCX file's navMap and Lists.

     Only children tagged 'sec', 'fig' or 'table-wrap' are considered, and
     each of them must carry an ``id`` attribute (a KeyError is raised
     otherwise). Children lacking a non-empty immediate <title> are skipped.
     Sections are returned as a list of navpoints with their sub-sections
     nested through recursion; figures and tables are appended as navtargets
     to ``self.list_of_figures`` and ``self.list_of_tables``.
     ``self.maxdepth`` is raised to ``depth`` when ``depth`` exceeds it.
     ``first`` is not used.
     """
     #TODO: This may need modification for non JPTS
     if self.maxdepth < depth:
         self.maxdepth = depth
     found = []
     for node in src_element:
         #Text nodes have no tag; other tags are of no interest here
         kind = getattr(node, 'tag', None)
         if kind not in ('sec', 'fig', 'table-wrap'):
             continue
         anchor = node.attrib['id']
         #In collection_mode, prepend the article_doi to avoid collisions;
         #in single mode the id is used as it is
         if self.collection_mode:
             nav_id = '{0}-{1}'.format(self.article_doi, anchor)
         else:
             nav_id = anchor
         #The title text serves as the label for the navpoint
         heading = node.find('title')
         if heading is None:
             continue
         label = element_methods.all_text(heading)
         if not label:
             continue
         source = 'main.{0}.xml#{1}'.format(self.article_doi, anchor)
         if kind == 'sec':
             #The play order is pulled before descending into the section
             play_order = self.pull_play_order()
             nested = self.recursive_article_navmap(node, depth=depth+1)
             entry = navpoint(nav_id, label, play_order, source, nested)
             found.append(entry)
         #figs and table-wraps do not have children
         elif kind == 'fig':
             entry = navtarget(nav_id, label, source)
             self.list_of_figures.append(entry)
         else:
             entry = navtarget(nav_id, label, source)
             self.list_of_tables.append(entry)
     return found
Esempio n. 5
0
        def recursive_element_packing(element):
            """
            Pack ``element`` and, recursively, its children into a namedtuple.

            The tuple is named after the (coerced) tag of the element and
            holds, in order: 'node' (the element itself), 'attrs' (a dict
            with one entry per attribute declared for the tag in dtd_dict,
            None for attributes the element does not carry), one field per
            sub element reported by get_sub_elements (a list of packed
            children when its occurrence is 'multiple', otherwise a single
            packed child or None) and finally 'text' when the content model
            includes 'pcdata'. Returns None when ``element`` is None.
            """
            if element is None:
                return None
            tagname = element.tag
            definition = dtd_dict[tagname]

            #Compose the attrs dict; prefixed attributes are keyed as
            #'prefix:name' and looked up under their namespaced name
            attrs = {}
            for declared in definition.iterattributes():
                prefix, name = declared.prefix, declared.name
                if not prefix:
                    key = lookup = name
                elif prefix == 'xmlns':  # Pseudo-attribute
                    continue
                else:
                    if prefix == 'xml':
                        lookup = '{{http://www.w3.org/XML/1998/namespace}}{0}'.format(name)
                    else:
                        lookup = '{' + element.nsmap[prefix] + '}' + name
                    key = '{0}:{1}'.format(prefix, name)
                #Absent attributes map to None; not worrying about implied
                #defaults right now
                attrs[key] = element.attrib.get(lookup)

            #Field names and values are kept in two parallel lists, starting
            #with the self reference and the attributes
            names = ['node', 'attrs']
            values = [element, attrs]
            has_text = False  # Set when PCDATA is in the content model
            for spec in get_sub_elements(definition.content, first=True):
                child_tag = spec.tag
                if child_tag == 'pcdata':
                    has_text = True
                    continue
                if spec.occurrence == 'multiple':
                    packed = [recursive_element_packing(found)
                              for found in element.findall(child_tag)]
                else:
                    #A missing child packs to None
                    packed = recursive_element_packing(element.find(child_tag))
                names.append(child_tag)
                values.append(packed)
            if has_text:
                names.append('text')
                values.append(element_methods.all_text(element))

            #Make the names safe for namedtuple: coerce the characters, then
            #prepend 'l' to reserved keywords (tagname and field names alike)
            names = list(map(coerce_string, names))
            if iskeyword(tagname):
                tagname = 'l' + tagname
            names = [('l' + each) if iskeyword(each) else each
                     for each in names]

            packed_type = namedtuple(coerce_string(tagname), ', '.join(names))
            return packed_type(*values)
Esempio n. 6
0
    def recursive_article_navmap(self, src_element, depth=0, first=True):
        """
        Recursively traverse the content below ``src_element`` to produce the
        entries of the NCX file's navMap and Lists.

        Returns the list of navpoints for the titled 'sec' children of
        ``src_element``, each holding the navpoints of its own sub-sections.
        Titled 'fig' and 'table-wrap' children are not returned; they are
        appended to ``self.figures_list`` and ``self.tables_list``. Children
        with any other tag, without an immediate <title>, or with an empty
        title are skipped. ``self.nav_depth`` records the deepest ``depth``
        seen. ``first`` is not used.
        """
        wanted = ('sec', 'fig', 'table-wrap')
        if depth > self.nav_depth:
            self.nav_depth = depth
        result = []
        for item in src_element:
            try:
                item_tag = item.tag
            except AttributeError:
                continue
            if item_tag not in wanted:
                continue

            #An element without an id is given an automatic one
            if 'id' not in item.attrib:
                item.attrib['id'] = self.auto_id
            plain_id = item.attrib['id']

            #Collection mode prepends the article DOI to avoid collisions
            unique_id = ('-'.join([self.article_doi, plain_id])
                         if self.collection else plain_id)

            #Infer the label from the immediate title; skip the element if
            #there is no title or no text in it
            title = item.find('title')
            if title is None:
                continue
            label = element_methods.all_text(title)
            if not label:
                continue

            source = 'main.{0}.xhtml#{1}'.format(self.article_doi, plain_id)
            if item_tag == 'fig':  # Goes to the list of figures
                self.figures_list.append(
                    navpoint(plain_id, label, None, source, []))
            elif item_tag == 'table-wrap':  # Goes to the list of tables
                self.tables_list.append(
                    navpoint(plain_id, label, None, source, []))
            else:
                #Only sections have children; they are collected before
                #play_order is read
                subsections = self.recursive_article_navmap(item,
                                                            depth=depth + 1)
                result.append(navpoint(unique_id, label, self.play_order,
                                       source, subsections))
        return result