Beispiel #1
0
 def clean(self, element):
     """Return a cleaned copy of ``element``, or ``None`` if it is dropped.

     * ``<img>`` elements that carry a ``src`` have the image downloaded
       into the task's ``download`` directory; ``src`` on the *input*
       element is rewritten to the bare file name before it is copied.
     * ``<a>`` elements whose ``href`` contains ``/Category`` are dropped.
     * ``span``/``p``/``div`` elements with neither children nor non-blank
       text are dropped.
     * Everything else is rebuilt as a fresh ``Element`` without the
       ``class``/``style``/``id`` attributes, with children cleaned
       recursively (dropped children are simply omitted).

     NOTE(review): only ``text`` is copied to the new element; ``tail``
     text is not carried over -- confirm that is intended.
     """
     # Local import: the original block used os.rename only.
     import shutil

     cleanElement = None
     dropEmpty = ('span', 'p', 'div')
     stripattribs = ('class', 'style', 'id')
     downloadDir = self.task.getProperty('download')
     # An <img> without src has nothing to download; it is just copied below.
     if 'img' == element.tag and element.attrib.get('src'):
         src = urlparse.urljoin(self.url, element.attrib['src'])
         tmpfile, info = urllib.urlretrieve(src)
         filename = None
         disposition = info.getheader('Content-Disposition')
         if disposition:
             # Header looks like: <type>; name=value; name=value ...
             # Skip the disposition type and pick out the filename
             # parameter; tolerate any number of parameters (the previous
             # two-way unpacking raised ValueError on anything but exactly
             # one parameter).
             for param in disposition.split(';')[1:]:
                 key, sep, value = param.partition('=')
                 if sep and key.strip().lower() == 'filename':
                     filename = value.strip().strip('"')
                     break
         if filename:
             # The name comes from the remote server: keep only its last
             # path component so it cannot escape the download directory.
             filename = os.path.basename(filename)
         if not filename:
             filename = os.path.basename(tmpfile)
         # Append the MIME subtype as extension unless the name already
         # ends with it.
         stem, dot, ext = filename.rpartition('.')
         if not dot or info.subtype != ext:
             filename = '.'.join((filename, info.subtype))
         element.attrib['src'] = filename
         # shutil.move falls back to copy+delete when the temporary file
         # and the download directory live on different filesystems,
         # where a plain os.rename fails.
         shutil.move(tmpfile, os.path.join(downloadDir, filename))
     # moin specific hack for now
     if 'a' == element.tag \
             and '/Category' in element.attrib.get('href', ''):
         # .get(): anchors without href (e.g. <a name=...>) must not raise
         pass
     elif element.tag not in dropEmpty \
             or bool(element.getchildren()) \
             or (bool(element.text)
                 and bool(element.text.strip())):
         cleanElement = Element(element.tag)
         cleanElement.text = element.text
         for a in element.attrib:
             if a not in stripattribs:
                 cleanElement.set(a, element.attrib[a])
         for e in element.getchildren():
             cleaned = self.clean(e)
             if cleaned is not None:
                 cleanElement.append(cleaned)
     return cleanElement
Beispiel #2
0
 def view_selection(self, req, resp, url):
     """
     View the highlighted selector (from `action_view`)

     Parses ``resp.body``, inserts a ``<base>`` element pointing at the
     directory of ``url``, runs the selector from ``req.GET['selector']``
     against the document and marks every selected element.  A message
     rendered from the found/not-found template is inserted at the top of
     ``<body>`` and the resulting page is returned as a ``Response``.

     Selected elements for which ``self._el_in_head`` is false get an
     ``id`` (their own if present, else ``deliverance-selection[-N]``)
     and a dotted red border; the others get a ``DELIVERANCE-MATCH``
     attribute instead.
     """
     from deliverance.selector import Selector
     doc = document_fromstring(resp.body)
     # Make relative links in the displayed page resolve against the
     # original location.
     el = Element('base')
     el.set('href', posixpath.dirname(url) + '/')
     doc.head.insert(0, el)
     selector = Selector.parse(req.GET['selector'])
     dummy_type, elements, dummy_attributes = selector(doc)
     if not elements:
         template = self._not_found_template
     else:
         template = self._found_template
     all_elements = []
     els_in_head = False
     for index, el in enumerate(elements):
         el_in_head = self._el_in_head(el)
         if el_in_head:
             els_in_head = True
         # First match: 'deliverance-selection'; later ones get '-<index>'.
         anchor = 'deliverance-selection'
         if index:
             anchor += '-%s' % index
         # An existing id wins over the generated anchor.
         if el.get('id'):
             anchor = el.get('id')
         ## FIXME: is a <a name> better?
         if not el_in_head:
             el.set('id', anchor)
         else:
             anchor = None
         ## FIXME: add :target CSS rule
         ## FIXME: or better, some Javascript
         all_elements.append((anchor, el))
         if not el_in_head:
             style = el.get('style', '')
             if style:
                 style += '; '
             style += '/* deliverance */ border: 2px dotted #f00'
             el.set('style', style)
         else:
             el.set('DELIVERANCE-MATCH', '1')

     def highlight(html_code):
         """Highlights the given code (for use in the template)"""
         if isinstance(html_code, _Element):
             html_code = tostring(html_code)
         return html(pygments_highlight(html_code, HtmlLexer(),
                                        HtmlFormatter(noclasses=True)))

     def format_tag(tag):
         """Highlights the lxml HTML tag"""
         # Only the opening tag: everything up to the first '>'.
         return highlight(tostring(tag).split('>')[0]+'>')

     def wrap_html(html, width=100):
         """Break lines longer than ``width`` at tag boundaries"""
         if isinstance(html, _Element):
             html = tostring(html)
         lines = html.splitlines()
         new_lines = []

         def wrap_html_line(line):
             """Return ``line`` split into a list of shorter pieces"""
             if len(line) <= width:
                 return [line]
             # Leading text followed by a closing tag: split after it.
             match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
             if match_trail:
                 result = [match_trail.group(0)]
                 result.extend(wrap_html_line(line[match_trail.end():]))
                 return result
             # Otherwise peel off the first and the last tag and wrap
             # whatever lies between them.
             match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
             match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
             if not match1 or not match2:
                 return [line]
             result = [match1.group(0)]
             result.extend(wrap_html_line(line[match1.end():match2.start()]))
             result.append(match2.group(0))
             return result

         for line in lines:
             new_lines.extend(wrap_html_line(line))
         return '\n'.join(new_lines)

     def mark_deliv_match(highlighted_text):
         """Wrap highlighted tags carrying DELIVERANCE-MATCH in a <b>"""
         # The call used to pass re.S as the fourth positional argument,
         # which re.sub() interprets as ``count`` (re.S == 16): no flag
         # was ever applied and only the first 16 matches were marked.
         # The stray argument is dropped so every match is marked.
         # DOTALL is deliberately left off, as it effectively always was:
         # with it the lazy ``.*?`` could start at an earlier ``&lt;`` on a
         # previous line and swallow unrelated tags.
         result = re.sub(
             r'(?:<[^/][^>]*>)*&lt;.*?DELIVERANCE-MATCH=.*?&gt;(?:</[^>]*>)*',
             lambda match: r'<b style="background-color: #ff8">%s</b>' % match.group(0),
             unicode(highlighted_text))
         return html(result)

     text = template.substitute(
         base_url=url,
         els_in_head=els_in_head, doc=doc,
         elements=all_elements, selector=selector,
         format_tag=format_tag, highlight=highlight,
         wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
     message = fromstring(
         self._message_template.substitute(message=text, url=url))
     # Text at the very start of <body> must follow the inserted message,
     # so move it onto the message's tail.
     if doc.body.text:
         message.tail = doc.body.text
         doc.body.text = ''
     doc.body.insert(0, message)
     text = tostring(doc)
     return Response(text)
Beispiel #3
0
    def view_selection(self, req, resp, url):
        """
        View the highlighted selector (from `action_view`)

        Parses ``resp.body``, inserts a ``<base>`` element pointing at the
        directory of ``url``, runs the selector from
        ``req.GET['selector']`` against the document and marks every
        selected element.  A message rendered from the found/not-found
        template is inserted at the top of ``<body>`` and the resulting
        page is returned as a ``Response``.

        Selected elements for which ``self._el_in_head`` is false get an
        ``id`` (their own if present, else ``deliverance-selection[-N]``)
        and a dotted red border; the others get a ``DELIVERANCE-MATCH``
        attribute instead.
        """
        from deliverance.selector import Selector
        doc = document_fromstring(resp.body)
        # Make relative links in the displayed page resolve against the
        # original location.
        el = Element('base')
        el.set('href', posixpath.dirname(url) + '/')
        doc.head.insert(0, el)
        selector = Selector.parse(req.GET['selector'])
        dummy_type, elements, dummy_attributes = selector(doc)
        if not elements:
            template = self._not_found_template
        else:
            template = self._found_template
        all_elements = []
        els_in_head = False
        for index, el in enumerate(elements):
            el_in_head = self._el_in_head(el)
            if el_in_head:
                els_in_head = True
            # First match: 'deliverance-selection'; later ones get
            # '-<index>' appended.
            anchor = 'deliverance-selection'
            if index:
                anchor += '-%s' % index
            # An existing id wins over the generated anchor.
            if el.get('id'):
                anchor = el.get('id')
            ## FIXME: is a <a name> better?
            if not el_in_head:
                el.set('id', anchor)
            else:
                anchor = None
            ## FIXME: add :target CSS rule
            ## FIXME: or better, some Javascript
            all_elements.append((anchor, el))
            if not el_in_head:
                style = el.get('style', '')
                if style:
                    style += '; '
                style += '/* deliverance */ border: 2px dotted #f00'
                el.set('style', style)
            else:
                el.set('DELIVERANCE-MATCH', '1')

        def highlight(html_code):
            """Highlights the given code (for use in the template)"""
            if isinstance(html_code, _Element):
                html_code = tostring(html_code)
            return html(
                pygments_highlight(html_code, HtmlLexer(),
                                   HtmlFormatter(noclasses=True)))

        def format_tag(tag):
            """Highlights the lxml HTML tag"""
            # Only the opening tag: everything up to the first '>'.
            return highlight(tostring(tag).split('>')[0] + '>')

        def wrap_html(html, width=100):
            """Break lines longer than ``width`` at tag boundaries"""
            if isinstance(html, _Element):
                html = tostring(html)
            lines = html.splitlines()
            new_lines = []

            def wrap_html_line(line):
                """Return ``line`` split into a list of shorter pieces"""
                if len(line) <= width:
                    return [line]
                # Leading text followed by a closing tag: split after it.
                match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
                if match_trail:
                    result = [match_trail.group(0)]
                    result.extend(wrap_html_line(line[match_trail.end():]))
                    return result
                # Otherwise peel off the first and the last tag and wrap
                # whatever lies between them.
                match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
                match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
                if not match1 or not match2:
                    return [line]
                result = [match1.group(0)]
                result.extend(wrap_html_line(
                    line[match1.end():match2.start()]))
                result.append(match2.group(0))
                return result

            for line in lines:
                new_lines.extend(wrap_html_line(line))
            return '\n'.join(new_lines)

        def mark_deliv_match(highlighted_text):
            """Wrap highlighted tags carrying DELIVERANCE-MATCH in a <b>"""
            # The call used to pass re.S as the fourth positional argument,
            # which re.sub() interprets as ``count`` (re.S == 16): no flag
            # was ever applied and only the first 16 matches were marked.
            # The stray argument is dropped so every match is marked.
            # DOTALL is deliberately left off, as it effectively always
            # was: with it the lazy ``.*?`` could start at an earlier
            # ``&lt;`` on a previous line and swallow unrelated tags.
            result = re.sub(
                r'(?:<[^/][^>]*>)*&lt;.*?DELIVERANCE-MATCH=.*?&gt;(?:</[^>]*>)*',
                lambda match: r'<b style="background-color: #ff8">%s</b>' %
                match.group(0), unicode(highlighted_text))
            return html(result)

        text = template.substitute(base_url=url,
                                   els_in_head=els_in_head,
                                   doc=doc,
                                   elements=all_elements,
                                   selector=selector,
                                   format_tag=format_tag,
                                   highlight=highlight,
                                   wrap_html=wrap_html,
                                   mark_deliv_match=mark_deliv_match)
        message = fromstring(
            self._message_template.substitute(message=text, url=url))
        # Text at the very start of <body> must follow the inserted
        # message, so move it onto the message's tail.
        if doc.body.text:
            message.tail = doc.body.text
            doc.body.text = ''
        doc.body.insert(0, message)
        text = tostring(doc)
        return Response(text)
def split_and_output(input_root,
                     template_file_name,
                     input_file_name,
                     output_folder=''):
    """Inject the input content into the HTML template and write it out.

    The template at ``template_file_name`` is parsed once and deep-copied
    for every non-empty section.  For each section the function sets the
    page titles, moves the section's elements into the
    ``content-goes-here`` div, gives every heading an id plus a permalink
    anchor, builds a static table of contents, strips carriage returns and
    writes the result as HTML.

    The file is written into ``output_folder`` when that is given,
    otherwise next to ``input_file_name``.  ``DATES``, ``DOCTYPE`` and
    ``fileextension`` are read from module scope.
    """
    template_tree = html.parse(template_file_name)

    # One bucket of elements per output file, keyed by the file label.
    sections = {'op': [], 'new_fb': [], 'an': []}
    # Every child of the top-level divs currently lands in the 'op'
    # bucket; the other buckets are never filled by this function.
    sections['op'].extend(input_root.xpath('//body/div/*'))

    for label, section_elements in sections.items():
        if not section_elements:
            continue

        # Work on a private copy of the template for this section.
        section_tree = deepcopy(template_tree)
        section_root = section_tree.getroot()

        # Page titles (only defined for the order paper itself).
        if label == 'op':
            title_text = 'Order Paper for ' + DATES.sitting_date_medium
            h1_text = 'Order Paper for ' + DATES.sitting_date_long
        section_root.xpath('//h1[@id="mainTitle"]')[0].text = h1_text
        section_root.xpath('//head/title')[0].text = title_text

        # Place in the template that receives the input html.
        injection_point = section_root.xpath(
            '//div[@id="content-goes-here"]')[0]
        for element in section_elements:
            # Skip the 'Future Business' summary heading at the start of
            # part 2.
            if ('paraChamberSummaryHeading' in element.classes
                    and element.text_content().lower() == 'future business'):
                continue
            if 'DocumentTitle' not in element.classes:
                injection_point.append(element)
                continue
            # Document titles are replaced by a fixed h2 for part 1 / part 2
            # and dropped otherwise.
            lowered_title = element.text_content().lower()
            part_heading = Element('h2')
            part_heading.set('class', 'OP-heading-outdent')
            if 'part 1' in lowered_title:
                part_heading.text = 'Part 1: Business Today'
                injection_point.append(part_heading)
            elif 'part 2' in lowered_title:
                part_heading.text = 'Part 2: Future Business'
                injection_point.append(part_heading)

        # IDs and permanent anchors on every heading-like element
        # (added at the request of IDMS).
        heading_xpath = (
            '//h1|//h2|//h3|//h4|//h5|//h6|//*[@class="paraBusinessItemHeading"]'
            '|//*[@class="paraBusinessItemHeading-bulleted"]|//*[@class="FbaLocation"]')
        for position, heading in enumerate(section_root.xpath(heading_xpath)):
            id_text = f'{DATES.sitting_date_compact}-{position}'

            # Keep a pre-existing id reachable via the name attribute.
            previous_id = heading.get('id', default=None)
            if previous_id:
                heading.set('name', previous_id)
            heading.set('id', id_text)

            anchor = SubElement(heading, 'a')
            permalink_for = 'Permalink for ' + heading.text_content()
            for attr_name, attr_value in (
                    ('href', '#' + id_text),
                    ('aria-label', 'Anchor'),
                    ('title', permalink_for),
                    ('data-anchor-icon', '§'),
                    ('class', 'anchor-link')):
                anchor.set(attr_name, attr_value)

        # Static table of contents.  tocbot overrides it in the browser,
        # but a ToC should still exist when JavaScript is disabled.
        nav_matches = section_root.xpath('//nav[@id="toc"][1]')
        toc_headings = section_root.xpath(
            '//*[contains(@class, "js-toc-content")]//h2')

        if nav_matches:
            toc_list = SubElement(nav_matches[0], 'ol')
            toc_list.set('class', 'toc-list')
            for toc_heading in toc_headings:
                toc_item = SubElement(toc_list, 'li')
                toc_item.set('class', 'toc-list-item')

                toc_link = SubElement(toc_item, 'a')
                toc_link.set('href', '#' + toc_heading.get('id', ''))
                toc_link.set('class', 'toc-link')
                toc_link.text = toc_heading.text_content()
        else:
            print('no element')

        # Strip carriage returns from every tail and text in the tree.
        for node in section_root.iter():
            if node.tail:
                node.tail = node.tail.replace('\r', '')
            if node.text:
                node.text = node.text.replace('\r', '')

        # Work out where the output file goes.
        outputfile_name = f'{label}{DATES.sitting_date_compact[2:]}{fileextension}'
        if output_folder:
            target_dir = Path(output_folder)
        else:
            target_dir = Path(input_file_name).parent
        outputfile_path = target_dir.joinpath(outputfile_name)

        section_tree.write(str(outputfile_path),
                           doctype=DOCTYPE,
                           encoding='UTF-8',
                           method="html",
                           xml_declaration=False)
        print(f'{label} file is at:\t{outputfile_path}')
def massarge_input_file(input_file_name):
    """Parse the exported HTML file and tidy it up for the web template.

    After running ``bad_classes`` on the file, the document is parsed and
    modified in place: the contents box is removed, generated override
    classes are stripped, self-referencing file names are removed from
    links, hanging indents / question numbers / ministerial statements /
    sponsor groups are wrapped in spans, redundant classes and wrappers
    are dropped, the front page tables are made presentational and every
    heading is demoted by one level.

    Returns the modified root element.
    """
    # Check for bad classes first.
    bad_classes(input_file_name)

    input_root = html.parse(input_file_name).getroot()

    # remove the contents div
    contents_div = input_root.xpath('body/div[@class="Contents-Box"]')
    if contents_div:
        contents_div[0].getparent().remove(contents_div[0])

    # remove all the _idGenParaOverrides
    # NOTE(review): the original pattern was spelled `_idGenParraOveride`
    # with no hyphen, which cannot match a class named
    # `_idGenParaOverride-N` as (presumably) generated by InDesign.  The
    # pattern below still matches everything the old one did and also
    # the correctly spelled, hyphenated form -- confirm against a real
    # export.
    override_re = re.compile(r' ?_idGenPar+aOverr?ide-?\d{1,3}')
    for paragraph in input_root.xpath('//p|//h1|//h2|//h3|//h4|//h5|//h6'):
        class_attr = paragraph.get('class', default='')
        if override_re.search(class_attr) is not None:
            print('override')
            paragraph.set('class', override_re.sub('', class_attr))

    # remove filename for internal hyperlinks
    inDesign_file_name = Path(input_file_name).name
    for link in input_root.xpath('//a'):
        if 'href' in link.attrib:
            link.attrib['href'] = link.attrib['href'].replace(
                inDesign_file_name, '')

    # there are 3 paragraph styles with hanging indents that must be
    # manipulated: the text before the first tab goes into a span
    for paragraph in input_root.xpath(
            '//p[@class="paraMotionSub-Paragraph" or '
            '@class="paraMotionSub-Sub-Paragraph" or '
            '@class="paraMotionSub-Sub-Sub-Paragraph"][text()]'):
        # [text()] also matches when the only text is a child's tail, in
        # which case paragraph.text is None.
        hanging_text, tab, remainder = (paragraph.text
                                        or '').partition('\u0009')
        if not tab:
            # dont do anything if there is no tab
            continue
        span_hanging = Element('span')
        span_hanging.set('class', 'hanging1')
        span_hanging.text = hanging_text
        span_hanging.tail = remainder
        paragraph.text = ''
        # Insert at the front: the span replaces the paragraph's leading
        # text, so it has to stay ahead of any existing inline children
        # (appending would move that text behind them).
        paragraph.insert(0, span_hanging)

    # sort out all the bullets
    for bullet in input_root.xpath('//span[@class="pythonFindBullet"]'):
        bullet.drop_tree()

    # sort the numbers
    for number in input_root.xpath('//p[@class="paraQuestion"]/span[1]'):
        # consider changing this in InDesign
        number.attrib['class'] = 'charBallotNumber'
        new_span = Element('span')
        new_span.classes.add('number-span')
        number_parent = number.getparent()
        # NOTE(review): unlike the ministerial statements below, the
        # number's tail is not detached first, so it moves into the
        # wrapper together with the span -- confirm that is intended.
        new_span.append(number)
        number_parent.insert(0, new_span)

    # sort ministerial statements
    for statement in input_root.xpath(
            '//p[@class="paraMinisterialStatement"]/span[1]'):
        statement.attrib['class'] = 'charItemNumber'
        # Keep the text following the number outside the wrapper span.
        statement_tail_text = statement.tail
        statement.tail = ''
        new_span = Element('span')
        new_span.classes.add('number-span')
        new_span.tail = statement_tail_text
        number_parent = statement.getparent()
        new_span.append(statement)
        number_parent.insert(0, new_span)

    # sort the front page tables
    for table in input_root.xpath('//table[@class="Front-Page-Table"]'):
        # added as a result of an accessibility audit
        table.set('role', 'presentation')
    for colgroup in input_root.xpath(
            '//table[@class="Front-Page-Table"]/colgroup'):
        # zip() stops at the shorter input, so a colgroup with fewer than
        # two columns no longer raises IndexError.
        for col, width in zip(colgroup, ('24%', '76%')):
            col.attrib.pop("class", None)
            col.attrib['width'] = width

    # sort motion sponsor groups
    sponsor_groups_xpath = '//p[@class="paraMotionSponsorGroup"]' \
                           '|//p[@class="MotionAmmendmentSponsorGroup"]' \
                           '|//p[@class="MotionAmmendmentSponsorGroup"]/span' \
                           '|//p[@class="A2A-SponsorGroup"]'
    for sponsor_group in input_root.xpath(sponsor_groups_xpath):
        if not sponsor_group.text:
            continue
        sponsor_group.classes.add('row')
        # split text on the tab character (InDesign puts in)
        sponsor_names = sponsor_group.text.split('\u0009')
        sponsor_group.text = None
        for sponsor_name in sponsor_names:
            sponsor_span = SubElement(sponsor_group, 'span')
            sponsor_span.classes.update(('col-12', 'col-sm-6', 'col-lg-4'))
            sponsor_span.text = sponsor_name

    # <strong class="Bold"> is overkill
    for strong_ele in input_root.xpath('//strong'):
        strong_ele.classes.discard('Bold')

    # dont need <span class="Hyperlink"> in a <a>
    for span in input_root.xpath('//a/span[@class="Hyperlink"]'):
        span.drop_tag()

    # seems like sometimes there are empty span.charStandingOrderReference
    for span in input_root.xpath(
            '//span[@class="charStandingOrderReference"]'):
        if not span.text or span.text.isspace():
            span.drop_tag()

    # Front-Page-Table doesnt need to be on the table the row and the td
    for tr in input_root.xpath(
            '//table[@class="Front-Page-Table"]//tr[@class="Front-Page-Table"]'
    ):
        tr.classes.discard('Front-Page-Table')
        for child in tr.iterchildren('td', 'th'):
            child.classes.discard('Front-Page-Table')

    # Demote every heading by one level (h1 -> h2, ..., h5 -> h6; h6 stays).
    # Work from the deepest level upwards so that a heading that has just
    # been renamed is not picked up and renamed again.
    heading_tags = ['h6', 'h5', 'h4', 'h3', 'h2', 'h1']
    for heading_tag, new_heading_tag in zip(heading_tags[1:], heading_tags):
        for heading in input_root.xpath(f'//{heading_tag}'):
            heading.tag = new_heading_tag

    # return the modified input html root element
    return input_root