Ejemplo n.º 1
0
    def view_selection(self, req, resp, url):
        """
        View the highlighted selector (from `action_view`)

        Parses ``resp.body`` as HTML, runs the selector given in
        ``req.GET['selector']`` against it, marks every matched element and
        returns a ``Response`` holding the annotated page, with a report
        (rendered from the found / not-found template) inserted at the top
        of ``<body>``.

        Matches outside ``<head>`` receive an ``id`` (their existing one if
        they have it) and a dotted red border.  Matches inside ``<head>``
        are only tagged with a ``DELIVERANCE-MATCH`` attribute, which the
        ``mark_deliv_match`` template helper looks for.
        """
        from deliverance.selector import Selector
        doc = document_fromstring(resp.body)
        # A <base> pointing at the directory of `url`, placed first in <head>
        el = Element('base')
        el.set('href', posixpath.dirname(url) + '/')
        doc.head.insert(0, el)
        selector = Selector.parse(req.GET['selector'])
        dummy_type, elements, dummy_attributes = selector(doc)
        if not elements:
            template = self._not_found_template
        else:
            template = self._found_template
        # (anchor, element) pairs; anchor is None for elements in <head>
        all_elements = []
        els_in_head = False
        for index, el in enumerate(elements):
            if self._el_in_head(el):
                els_in_head = True
                el.set('DELIVERANCE-MATCH', '1')
                all_elements.append((None, el))
                continue
            # Reuse the element's own id when it has one; otherwise make up
            # an anchor (the first match gets no numeric suffix).
            anchor = el.get('id')
            if not anchor:
                anchor = 'deliverance-selection'
                if index:
                    anchor += '-%s' % index
            ## FIXME: is a <a name> better?
            el.set('id', anchor)
            ## FIXME: add :target CSS rule
            ## FIXME: or better, some Javascript
            all_elements.append((anchor, el))
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)

        def highlight(html_code):
            """Highlights the given code (for use in the template)"""
            if isinstance(html_code, _Element):
                html_code = tostring(html_code)
            return html(
                pygments_highlight(html_code, HtmlLexer(),
                                   HtmlFormatter(noclasses=True)))

        def format_tag(tag):
            """Highlights the lxml HTML tag"""
            # Everything up to the first '>' is the opening tag
            return highlight(tostring(tag).split('>')[0] + '>')

        def wrap_html(html, width=100):
            """Breaks lines longer than `width` at tag boundaries"""
            if isinstance(html, _Element):
                html = tostring(html)
            lines = html.splitlines()
            new_lines = []

            def wrap_html_line(line):
                """Returns `line` split into a list of shorter pieces"""
                if len(line) <= width:
                    return [line]
                # Leading text up to and including a closing tag
                match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
                if match_trail:
                    result = [match_trail.group(0)]
                    result.extend(wrap_html_line(line[match_trail.end():]))
                    return result
                # Otherwise peel off the first tag and the last tag and
                # recurse on whatever lies between them.
                match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
                match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
                if (not match1 or not match2
                        or match2.start() < match1.end()):
                    # Nothing to split on, or the two matches overlap (for
                    # instance the line is one over-long tag).  Slicing
                    # between overlapping matches would emit the shared
                    # text twice, so the line is left as it is.
                    return [line]
                result = [match1.group(0)]
                result.extend(wrap_html_line(
                    line[match1.end():match2.start()]))
                result.append(match2.group(0))
                return result

            for line in lines:
                new_lines.extend(wrap_html_line(line))
            return '\n'.join(new_lines)

        def mark_deliv_match(highlighted_text):
            """Wraps highlighted tags carrying DELIVERANCE-MATCH in a <b>"""
            # This call used to pass re.S as the fourth positional argument
            # of re.sub(), which is `count`, not `flags`: DOTALL was never
            # in effect and at most 16 matches were marked.  The stray
            # argument is dropped so every match is marked.  DOTALL stays
            # off on purpose: with it the lazy `.*?` could start at a tag
            # on an earlier line and swallow everything up to the match.
            result = re.sub(
                r'(?:<[^/][^>]*>)*&lt;.*?DELIVERANCE-MATCH=.*?&gt;(?:</[^>]*>)*',
                lambda match: r'<b style="background-color: #ff8">%s</b>' %
                match.group(0), unicode(highlighted_text))
            return html(result)

        text = template.substitute(base_url=url,
                                   els_in_head=els_in_head,
                                   doc=doc,
                                   elements=all_elements,
                                   selector=selector,
                                   format_tag=format_tag,
                                   highlight=highlight,
                                   wrap_html=wrap_html,
                                   mark_deliv_match=mark_deliv_match)
        message = fromstring(
            self._message_template.substitute(message=text, url=url))
        # Keep any text that opened <body> after the inserted message
        if doc.body.text:
            message.tail = doc.body.text
            doc.body.text = ''
        doc.body.insert(0, message)
        text = tostring(doc)
        return Response(text)
Ejemplo n.º 2
0
 def view_selection(self, req, resp, url):
     """
     View the highlighted selector (from `action_view`)

     Parses ``resp.body`` as HTML, runs the selector given in
     ``req.GET['selector']`` against it, marks every matched element and
     returns a ``Response`` holding the annotated page, with a report
     (rendered from the found / not-found template) inserted at the top
     of ``<body>``.

     Matches outside ``<head>`` receive an ``id`` (their existing one if
     they have it) and a dotted red border.  Matches inside ``<head>``
     are only tagged with a ``DELIVERANCE-MATCH`` attribute, which the
     ``mark_deliv_match`` template helper looks for.
     """
     from deliverance.selector import Selector
     doc = document_fromstring(resp.body)
     # A <base> pointing at the directory of `url`, placed first in <head>
     el = Element('base')
     el.set('href', posixpath.dirname(url) + '/')
     doc.head.insert(0, el)
     selector = Selector.parse(req.GET['selector'])
     dummy_type, elements, dummy_attributes = selector(doc)
     if not elements:
         template = self._not_found_template
     else:
         template = self._found_template
     # (anchor, element) pairs; anchor is None for elements in <head>
     all_elements = []
     els_in_head = False
     for index, el in enumerate(elements):
         if self._el_in_head(el):
             els_in_head = True
             el.set('DELIVERANCE-MATCH', '1')
             all_elements.append((None, el))
             continue
         # Reuse the element's own id when it has one; otherwise make up
         # an anchor (the first match gets no numeric suffix).
         anchor = el.get('id')
         if not anchor:
             anchor = 'deliverance-selection'
             if index:
                 anchor += '-%s' % index
         ## FIXME: is a <a name> better?
         el.set('id', anchor)
         ## FIXME: add :target CSS rule
         ## FIXME: or better, some Javascript
         all_elements.append((anchor, el))
         style = el.get('style', '')
         if style:
             style += '; '
         style += '/* deliverance */ border: 2px dotted #f00'
         el.set('style', style)

     def highlight(html_code):
         """Highlights the given code (for use in the template)"""
         if isinstance(html_code, _Element):
             html_code = tostring(html_code)
         return html(pygments_highlight(html_code, HtmlLexer(),
                                        HtmlFormatter(noclasses=True)))

     def format_tag(tag):
         """Highlights the lxml HTML tag"""
         # Everything up to the first '>' is the opening tag
         return highlight(tostring(tag).split('>')[0]+'>')

     def wrap_html(html, width=100):
         """Breaks lines longer than `width` at tag boundaries"""
         if isinstance(html, _Element):
             html = tostring(html)
         lines = html.splitlines()
         new_lines = []

         def wrap_html_line(line):
             """Returns `line` split into a list of shorter pieces"""
             if len(line) <= width:
                 return [line]
             # Leading text up to and including a closing tag
             match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
             if match_trail:
                 result = [match_trail.group(0)]
                 result.extend(wrap_html_line(line[match_trail.end():]))
                 return result
             # Otherwise peel off the first tag and the last tag and
             # recurse on whatever lies between them.
             match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
             match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
             if (not match1 or not match2
                     or match2.start() < match1.end()):
                 # Nothing to split on, or the two matches overlap (for
                 # instance the line is one over-long tag).  Slicing
                 # between overlapping matches would emit the shared
                 # text twice, so the line is left as it is.
                 return [line]
             result = [match1.group(0)]
             result.extend(wrap_html_line(line[match1.end():match2.start()]))
             result.append(match2.group(0))
             return result

         for line in lines:
             new_lines.extend(wrap_html_line(line))
         return '\n'.join(new_lines)

     def mark_deliv_match(highlighted_text):
         """Wraps highlighted tags carrying DELIVERANCE-MATCH in a <b>"""
         # This call used to pass re.S as the fourth positional argument
         # of re.sub(), which is `count`, not `flags`: DOTALL was never
         # in effect and at most 16 matches were marked.  The stray
         # argument is dropped so every match is marked.  DOTALL stays
         # off on purpose: with it the lazy `.*?` could start at a tag
         # on an earlier line and swallow everything up to the match.
         result = re.sub(
             r'(?:<[^/][^>]*>)*&lt;.*?DELIVERANCE-MATCH=.*?&gt;(?:</[^>]*>)*',
             lambda match: r'<b style="background-color: #ff8">%s</b>' %
             match.group(0), unicode(highlighted_text))
         return html(result)

     text = template.substitute(
         base_url=url,
         els_in_head=els_in_head, doc=doc,
         elements=all_elements, selector=selector,
         format_tag=format_tag, highlight=highlight,
         wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
     message = fromstring(
         self._message_template.substitute(message=text, url=url))
     # Keep any text that opened <body> after the inserted message
     if doc.body.text:
         message.tail = doc.body.text
         doc.body.text = ''
     doc.body.insert(0, message)
     text = tostring(doc)
     return Response(text)
def split_and_output(input_root,
                     template_file_name,
                     input_file_name,
                     output_folder=''):
    """Merge the input content into the HTML template and write the result.

    For every non-empty section the template tree is copied, the page
    titles are set, the section's elements are moved into the
    ``content-goes-here`` div, headings get ids plus permalink anchors, a
    static table of contents is added to ``nav#toc`` and carriage returns
    are stripped before the page is written to disk.

    Args:
        input_root: Root whose ``//body/div/*`` elements supply the content.
        template_file_name: Template passed to ``html.parse``.
        input_file_name: Used only for its parent directory, which receives
            the output when ``output_folder`` is empty.
        output_folder: Optional directory for the output file.
    """
    template_tree = html.parse(template_file_name)

    # Element lists keyed by file label.  Every child of the top-level body
    # divs currently lands in 'op'; the other lists stay empty.
    sections = {'op': [], 'new_fb': [], 'an': []}
    sections['op'].extend(input_root.xpath('//body/div/*'))

    for file_lable, element_list in sections.items():
        if not element_list:
            continue

        # Work on a private copy of the template for this section.
        page_tree = deepcopy(template_tree)
        page_root = page_tree.getroot()

        # Page titles (only defined here for the 'op' section).
        if file_lable == 'op':
            title_text = 'Order Paper for ' + DATES.sitting_date_medium
            h1_text = 'Order Paper for ' + DATES.sitting_date_long
        page_root.xpath('//h1[@id="mainTitle"]')[0].text = h1_text
        page_root.xpath('//head/title')[0].text = title_text

        # Where in the template the input html gets injected.
        content_div = page_root.xpath('//div[@id="content-goes-here"]')[0]
        for element in element_list:
            css_classes = element.classes
            # Drop the 'Future Business' heading from the start of part 2.
            if ('paraChamberSummaryHeading' in css_classes
                    and element.text_content().lower() == 'future business'):
                continue
            if 'DocumentTitle' not in css_classes:
                content_div.append(element)
                continue
            # Document titles are replaced by an h2 for part 1 / part 2 and
            # dropped altogether otherwise.
            title_words = element.text_content().lower()
            if 'part 1' in title_words:
                part_heading = 'Part 1: Business Today'
            elif 'part 2' in title_words:
                part_heading = 'Part 2: Future Business'
            else:
                continue
            part_h2 = Element('h2')
            part_h2.set('class', 'OP-heading-outdent')
            part_h2.text = part_heading
            content_div.append(part_h2)

        # Add ids and permanent anchors to every heading
        # (added at the request of IDMS).
        heading_xpath = (
            '//h1|//h2|//h3|//h4|//h5|//h6'
            '|//*[@class="paraBusinessItemHeading"]'
            '|//*[@class="paraBusinessItemHeading-bulleted"]'
            '|//*[@class="FbaLocation"]'
        )
        for i, heading in enumerate(page_root.xpath(heading_xpath)):
            id_text = f'{DATES.sitting_date_compact}-{i}'

            # Keep a pre-existing id reachable through the name attribute.
            previous_id = heading.get('id')
            if previous_id:
                heading.set('name', previous_id)
            heading.set('id', id_text)

            anchor = SubElement(heading, 'a')
            for attr_name, attr_value in (
                    ('href', '#' + id_text),
                    ('aria-label', 'Anchor'),
                    ('title', 'Permalink for ' + heading.text_content()),
                    ('data-anchor-icon', '§'),
                    ('class', 'anchor-link')):
                anchor.set(attr_name, attr_value)

        # Static table of contents.  tocbot will override it, but a ToC is
        # still wanted when JavaScript is disabled.
        toc_navs = page_root.xpath('//nav[@id="toc"][1]')
        # Same scope as tocbot's contentSelector: '.js-toc-content'
        toc_headings = page_root.xpath(
            '//*[contains(@class, "js-toc-content")]//h2')
        if not toc_navs:
            print('no element')
        else:
            toc_list = SubElement(toc_navs[0], 'ol')
            toc_list.set('class', 'toc-list')
            for toc_heading in toc_headings:
                item = SubElement(toc_list, 'li')
                item.set('class', 'toc-list-item')

                link = SubElement(item, 'a')
                link.set('href', '#' + toc_heading.get('id', ''))
                link.set('class', 'toc-link')
                link.text = toc_heading.text_content()

        # Strip carriage returns from the tail and text of every node.
        for node in page_root.iter():
            for part in ('tail', 'text'):
                value = getattr(node, part)
                if value:
                    setattr(node, part, value.replace('\r', ''))

        # Work out where the output html file goes.
        outputfile_name = f'{file_lable}{DATES.sitting_date_compact[2:]}{fileextension}'
        if output_folder:
            target_dir = Path(output_folder)
        else:
            target_dir = Path(input_file_name).parent
        outputfile_path = target_dir.joinpath(outputfile_name)

        page_tree.write(str(outputfile_path),
                        doctype=DOCTYPE,
                        encoding='UTF-8',
                        method="html",
                        xml_declaration=False)
        print(f'{file_lable} file is at:\t{outputfile_path}')