def clean(self, element):
    """Return a stripped-down copy of ``element``, or None to drop it.

    The copy keeps the tag, the text and every attribute except ``class``,
    ``style`` and ``id``; children are cleaned recursively and children
    that clean to None are left out.

    Special cases:

    * ``img``: the image is downloaded, moved into the task's ``download``
      directory and the element's ``src`` is rewritten to the bare file
      name (this mutates ``element`` itself before it is copied).
    * ``a`` whose ``href`` contains ``/Category``: dropped (moin hack).
    * ``span``/``p``/``div`` with no children and no non-blank text:
      dropped.

    NOTE(review): ``tail`` text is not copied onto the result, so text
    that follows an element inside its parent is lost -- confirm that
    this is intended.
    """
    import shutil  # local import: only the image branch needs it

    drop_when_empty = ('span', 'p', 'div')
    strip_attribs = ('class', 'style', 'id')

    # An <img> without a src has nothing to download; it is just copied.
    if element.tag == 'img' and element.attrib.get('src'):
        src = urlparse.urljoin(self.url, element.attrib['src'])
        tmp_path, info = urllib.urlretrieve(src)

        # Look for a ``filename=`` parameter.  The header may carry no
        # parameters at all, or several, so never unpack a fixed number.
        filename = None
        disposition = info.getheader('Content-Disposition')
        if disposition:
            for param in disposition.split(';')[1:]:
                key, sep, value = param.partition('=')
                if sep and key.strip().lower() == 'filename':
                    filename = value.strip().strip('"')
                    break
        if filename:
            # The name comes from the remote server: keep only the last
            # path component so it cannot escape the download directory.
            filename = os.path.basename(filename.replace('\\', '/'))
        if not filename:
            filename = os.path.basename(tmp_path)

        # Append the MIME subtype unless the name already ends with it.
        _, dot, ext = filename.rpartition('.')
        if not dot or ext != info.subtype:
            filename = '.'.join((filename, info.subtype))

        element.attrib['src'] = filename
        download_dir = self.task.getProperty('download')
        # shutil.move also works when the temporary file and the download
        # directory live on different filesystems (os.rename does not).
        shutil.move(tmp_path, os.path.join(download_dir, filename))

    # moin specific hack for now
    if element.tag == 'a' and '/Category' in element.attrib.get('href', ''):
        return None

    has_text = bool(element.text and element.text.strip())
    if element.tag in drop_when_empty and not len(element) and not has_text:
        return None

    cleaned = Element(element.tag)
    cleaned.text = element.text
    for name, value in element.attrib.items():
        if name not in strip_attribs:
            cleaned.set(name, value)
    for child in element:
        cleaned_child = self.clean(child)
        if cleaned_child is not None:
            cleaned.append(cleaned_child)
    return cleaned
def view_selection(self, req, resp, url):
    """
    View the highlighted selector (from `action_view`)

    Parses ``resp.body``, runs the selector from ``req.GET['selector']``
    against it and marks every match: elements outside ``<head>`` get an
    ``id`` (their own, or a generated ``deliverance-selection[-N]``) and a
    dotted red border; elements inside ``<head>`` get a
    ``DELIVERANCE-MATCH`` attribute instead.  The rendered report is
    inserted as the first child of ``<body>`` and the serialized document
    is returned in a ``Response``.
    """
    from deliverance.selector import Selector
    doc = document_fromstring(resp.body)
    # Resolve relative links in the page against the original location.
    el = Element('base')
    el.set('href', posixpath.dirname(url) + '/')
    doc.head.insert(0, el)
    selector = Selector.parse(req.GET['selector'])
    dummy_type, elements, dummy_attributes = selector(doc)
    if not elements:
        template = self._not_found_template
    else:
        template = self._found_template
    all_elements = []
    els_in_head = False
    for index, el in enumerate(elements):
        el_in_head = self._el_in_head(el)
        if el_in_head:
            els_in_head = True
        anchor = 'deliverance-selection'
        if index:
            anchor += '-%s' % index
        if el.get('id'):
            anchor = el.get('id')
        ## FIXME: is a <a name> better?
        if not el_in_head:
            el.set('id', anchor)
        else:
            anchor = None
        ## FIXME: add :target CSS rule
        ## FIXME: or better, some Javascript
        all_elements.append((anchor, el))
        if not el_in_head:
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)
        else:
            el.set('DELIVERANCE-MATCH', '1')

    def highlight(html_code):
        """Highlights the given code (for use in the template)"""
        if isinstance(html_code, _Element):
            html_code = tostring(html_code)
        return html(pygments_highlight(html_code, HtmlLexer(),
                                       HtmlFormatter(noclasses=True)))

    def format_tag(tag):
        """Highlights the lxml HTML tag"""
        return highlight(tostring(tag).split('>')[0]+'>')

    def wrap_html(html, width=100):
        """Breaks lines longer than `width` at tag boundaries"""
        if isinstance(html, _Element):
            html = tostring(html)
        lines = html.splitlines()
        new_lines = []

        def wrap_html_line(line):
            """Returns `line` as a list of shorter pieces"""
            if len(line) <= width:
                return [line]
            # Leading text followed by a closing tag: split after the tag.
            match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
            if match_trail:
                result = [match_trail.group(0)]
                result.extend(wrap_html_line(line[match_trail.end():]))
                return result
            # Otherwise peel off the first tag and the last tag and wrap
            # whatever lies between them.
            match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
            match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
            if not match1 or not match2:
                return [line]
            if match2.start() < match1.end():
                # The line holds a single tag, so both patterns matched
                # the same tag; emitting both would duplicate it.
                return [line]
            result = [match1.group(0)]
            result.extend(wrap_html_line(line[match1.end():match2.start()]))
            result.append(match2.group(0))
            return result

        for line in lines:
            new_lines.extend(wrap_html_line(line))
        return '\n'.join(new_lines)

    def mark_deliv_match(highlighted_text):
        """Wraps markup containing DELIVERANCE-MATCH in a highlighted <b>"""
        # Do not pass re.S as a fourth positional argument: that slot is
        # `count`, so it would cap the number of replacements at
        # int(re.S) without enabling DOTALL.  DOTALL is deliberately left
        # off so that `<.*?` cannot begin at a tag on an earlier line and
        # swallow everything up to the marker.
        result = re.sub(
            r'(?:<[^/][^>]*>)*<.*?DELIVERANCE-MATCH=.*?>(?:</[^>]*>)*',
            lambda match: (r'<b style="background-color: #ff8">%s</b>'
                           % match.group(0)),
            unicode(highlighted_text))
        return html(result)

    text = template.substitute(
        base_url=url,
        els_in_head=els_in_head, doc=doc,
        elements=all_elements, selector=selector,
        format_tag=format_tag, highlight=highlight,
        wrap_html=wrap_html, mark_deliv_match=mark_deliv_match)
    message = fromstring(
        self._message_template.substitute(message=text, url=url))
    # The message becomes the first child of <body>, so any text that used
    # to lead the body has to follow the message instead.
    if doc.body.text:
        message.tail = doc.body.text
        doc.body.text = ''
    doc.body.insert(0, message)
    text = tostring(doc)
    return Response(text)
def view_selection(self, req, resp, url):
    """
    View the highlighted selector (from `action_view`)

    Parses ``resp.body``, runs the selector from ``req.GET['selector']``
    against it and marks every match: elements outside ``<head>`` get an
    ``id`` (their own, or a generated ``deliverance-selection[-N]``) and a
    dotted red border; elements inside ``<head>`` get a
    ``DELIVERANCE-MATCH`` attribute instead.  The rendered report is
    inserted as the first child of ``<body>`` and the serialized document
    is returned in a ``Response``.
    """
    from deliverance.selector import Selector
    doc = document_fromstring(resp.body)
    # Resolve relative links in the page against the original location.
    el = Element('base')
    el.set('href', posixpath.dirname(url) + '/')
    doc.head.insert(0, el)
    selector = Selector.parse(req.GET['selector'])
    dummy_type, elements, dummy_attributes = selector(doc)
    if not elements:
        template = self._not_found_template
    else:
        template = self._found_template
    all_elements = []
    els_in_head = False
    for index, el in enumerate(elements):
        el_in_head = self._el_in_head(el)
        if el_in_head:
            els_in_head = True
        anchor = 'deliverance-selection'
        if index:
            anchor += '-%s' % index
        if el.get('id'):
            anchor = el.get('id')
        ## FIXME: is a <a name> better?
        if not el_in_head:
            el.set('id', anchor)
        else:
            anchor = None
        ## FIXME: add :target CSS rule
        ## FIXME: or better, some Javascript
        all_elements.append((anchor, el))
        if not el_in_head:
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)
        else:
            el.set('DELIVERANCE-MATCH', '1')

    def highlight(html_code):
        """Highlights the given code (for use in the template)"""
        if isinstance(html_code, _Element):
            html_code = tostring(html_code)
        return html(
            pygments_highlight(html_code, HtmlLexer(),
                               HtmlFormatter(noclasses=True)))

    def format_tag(tag):
        """Highlights the lxml HTML tag"""
        return highlight(tostring(tag).split('>')[0] + '>')

    def wrap_html(html, width=100):
        """Breaks lines longer than `width` at tag boundaries"""
        if isinstance(html, _Element):
            html = tostring(html)
        lines = html.splitlines()
        new_lines = []

        def wrap_html_line(line):
            """Returns `line` as a list of shorter pieces"""
            if len(line) <= width:
                return [line]
            # Leading text followed by a closing tag: split after the tag.
            match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
            if match_trail:
                result = [match_trail.group(0)]
                result.extend(wrap_html_line(line[match_trail.end():]))
                return result
            # Otherwise peel off the first tag and the last tag and wrap
            # whatever lies between them.
            match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
            match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
            if not match1 or not match2:
                return [line]
            if match2.start() < match1.end():
                # Only one tag on the line: both patterns matched that
                # same tag, and returning both would emit it twice.
                return [line]
            result = [match1.group(0)]
            result.extend(wrap_html_line(
                line[match1.end():match2.start()]))
            result.append(match2.group(0))
            return result

        for line in lines:
            new_lines.extend(wrap_html_line(line))
        return '\n'.join(new_lines)

    def mark_deliv_match(highlighted_text):
        """Wraps markup containing DELIVERANCE-MATCH in a highlighted <b>"""
        # re.sub's fourth positional parameter is `count`, not `flags`;
        # passing re.S there would limit the number of replacements to
        # int(re.S) and leave DOTALL off.  DOTALL stays off on purpose:
        # with it, `<.*?` could start at a tag on an earlier line and
        # highlight everything up to the marker.
        result = re.sub(
            r'(?:<[^/][^>]*>)*<.*?DELIVERANCE-MATCH=.*?>(?:</[^>]*>)*',
            lambda match: r'<b style="background-color: #ff8">%s</b>' %
            match.group(0),
            unicode(highlighted_text))
        return html(result)

    text = template.substitute(base_url=url,
                               els_in_head=els_in_head,
                               doc=doc,
                               elements=all_elements,
                               selector=selector,
                               format_tag=format_tag,
                               highlight=highlight,
                               wrap_html=wrap_html,
                               mark_deliv_match=mark_deliv_match)
    message = fromstring(
        self._message_template.substitute(message=text, url=url))
    # The message becomes the first child of <body>, so any text that used
    # to lead the body has to follow the message instead.
    if doc.body.text:
        message.tail = doc.body.text
        doc.body.text = ''
    doc.body.insert(0, message)
    text = tostring(doc)
    return Response(text)
def split_and_output(input_root, template_file_name, input_file_name,
                     output_folder=''):
    """Inject the input document into the HTML template and write it out.

    Every element matched by ``//body/div/*`` in ``input_root`` is moved
    into a copy of the template parsed from ``template_file_name``.
    Headings then receive ids and permalink anchors, a static table of
    contents is built, carriage returns are stripped, and the result is
    written as ``<label><date><fileextension>``.

    Args:
        input_root: root element of the (already massaged) input HTML.
        template_file_name: path of the HTML template to parse.
        input_file_name: path of the input file; its parent directory is
            the output location when ``output_folder`` is empty.
        output_folder: optional directory to write the output file to.
    """
    output_tree = html.parse(template_file_name)

    # Element lists keyed by file label.  Only 'op' is filled at the
    # moment, so the other labels never produce a file.
    file_lables_element_lists = {'op': [], 'new_fb': [], 'an': []}
    file_lables_element_lists['op'].extend(input_root.xpath('//body/div/*'))

    for file_lable, element_list in file_lables_element_lists.items():
        if not element_list:
            continue

        # Work on a copy so each label starts from a pristine template.
        temp_output_tree = deepcopy(output_tree)
        temp_output_root = temp_output_tree.getroot()

        # Titles are only defined for the Order Paper.  Any other label
        # keeps the template's own titles rather than hitting an unbound
        # variable or reusing the titles of a previous label.
        if file_lable == 'op':
            title_text = 'Order Paper for ' + DATES.sitting_date_medium
            h1_text = 'Order Paper for ' + DATES.sitting_date_long
            temp_output_root.xpath('//h1[@id="mainTitle"]')[0].text = h1_text
            temp_output_root.xpath('//head/title')[0].text = title_text

        # The position (in the template) where the input html is injected.
        code_injection_point = temp_output_root.xpath(
            '//div[@id="content-goes-here"]')[0]
        _inject_content(code_injection_point, element_list)

        _add_heading_permalinks(temp_output_root)
        _build_static_toc(temp_output_root)
        _strip_carriage_returns(temp_output_root)

        # Write out the output html file.
        outputfile_name = (
            f'{file_lable}{DATES.sitting_date_compact[2:]}{fileextension}')
        if output_folder:
            outputfile_path = Path(output_folder).joinpath(outputfile_name)
        else:
            outputfile_path = Path(input_file_name).parent.joinpath(
                outputfile_name)

        temp_output_tree.write(str(outputfile_path),
                               doctype=DOCTYPE,
                               encoding='UTF-8',
                               method="html",
                               xml_declaration=False)
        print(f'{file_lable} file is at:\t{outputfile_path}')


def _inject_content(code_injection_point, element_list):
    """Append the input elements to the template, rewriting part headings."""
    for element in element_list:
        # Remove the Future Business heading from the start of part 2.
        if ('paraChamberSummaryHeading' in element.classes
                and element.text_content().lower() == 'future business'):
            continue

        if 'DocumentTitle' not in element.classes:
            code_injection_point.append(element)
            continue

        # Document titles are never copied; the "part 1" / "part 2" ones
        # are replaced by a fixed <h2>, any other title is dropped.
        text_content = element.text_content().lower()
        if 'part 1' in text_content:
            heading_text = 'Part 1: Business Today'
        elif 'part 2' in text_content:
            heading_text = 'Part 2: Future Business'
        else:
            continue
        h2 = Element('h2')
        h2.set('class', 'OP-heading-outdent')
        h2.text = heading_text
        code_injection_point.append(h2)


def _add_heading_permalinks(root):
    """Give every heading a date-based id and a permalink anchor.

    Added at the request of IDMS.
    """
    xpath = '//h1|//h2|//h3|//h4|//h5|//h6|//*[@class="paraBusinessItemHeading"]' \
            '|//*[@class="paraBusinessItemHeading-bulleted"]|//*[@class="FbaLocation"]'
    for i, heading in enumerate(root.xpath(xpath)):
        id_text = f'{DATES.sitting_date_compact}-{i}'
        # Keep a pre-existing id reachable through the name attribute.
        if heading.get('id'):
            heading.set('name', heading.get('id'))
        heading.set('id', id_text)

        anchor = SubElement(heading, 'a')
        anchor.set('href', '#' + id_text)
        anchor.set('aria-label', 'Anchor')
        anchor.set('title', 'Permalink for ' + heading.text_content())
        anchor.set('data-anchor-icon', '§')
        anchor.set('class', 'anchor-link')


def _build_static_toc(root):
    """Build a plain <ol> table of contents from the h2 headings.

    This will be overridden by tocbot, but we still want a ToC even if
    JavaScript is disabled.
    """
    nav_xpath_results = root.xpath('//nav[@id="toc"][1]')
    if not nav_xpath_results:
        print('no element')
        return

    ol = SubElement(nav_xpath_results[0], 'ol')
    ol.set('class', 'toc-list')
    # Same content selector tocbot uses: '.js-toc-content'.
    for h2 in root.xpath('//*[contains(@class, "js-toc-content")]//h2'):
        li = SubElement(ol, 'li')
        li.set('class', 'toc-list-item')
        a = SubElement(li, 'a')
        a.set('href', '#' + h2.get('id', ''))
        a.set('class', 'toc-link')
        a.text = h2.text_content()


def _strip_carriage_returns(root):
    """Remove carriage returns from the text and tail of every element."""
    for element in root.iter():
        if element.tail:
            element.tail = element.tail.replace('\r', '')
        if element.text:
            element.text = element.text.replace('\r', '')
def massarge_input_file(input_file_name):
    """Parse the InDesign HTML export and tidy it for publication.

    Runs ``bad_classes`` on the file, parses it, and then rewrites the
    tree in place: removes the contents box, strips the file name from
    internal links, wraps hanging-indent labels and item numbers in
    spans, lays out sponsor names, removes redundant classes and tags,
    and demotes every heading by one level.

    Args:
        input_file_name: path of the HTML file to read.

    Returns:
        The modified root element of the parsed document.
    """
    # test for bad classes
    bad_classes(input_file_name)

    input_root = html.parse(input_file_name).getroot()

    # remove the contents div
    contents_div = input_root.xpath('body/div[@class="Contents-Box"]')
    if contents_div:
        contents_div[0].getparent().remove(contents_div[0])

    # remove all the _idGenParaOverrides
    # NOTE(review): the pattern is spelt "ParraOveride" with no hyphen
    # before the digits; if the exported classes are actually spelt
    # "_idGenParaOverride-N" this never matches -- confirm against a real
    # export before changing it.
    override_pattern = r' ?_idGenParraOveride\d\d?\d?'
    for paragraph in input_root.xpath('//p|//h1|//h2|//h3|//h4|//h5|//h6'):
        class_value = paragraph.get('class', default='')
        if re.search(override_pattern, class_value) is not None:
            print('override')
            paragraph.set('class', re.sub(override_pattern, '', class_value))

    # remove filename for internal hyperlinks
    inDesign_file_name = Path(input_file_name).name
    for link in input_root.xpath('//a'):
        if 'href' in link.attrib:
            link.attrib['href'] = link.attrib['href'].replace(
                inDesign_file_name, '')

    # there are 3 paragraph styles with hanging indents that must be
    # manipulated: the text before the first tab goes into a span
    for paragraph in input_root.xpath(
            '//p[@class="paraMotionSub-Paragraph" or '
            '@class="paraMotionSub-Sub-Paragraph" or '
            '@class="paraMotionSub-Sub-Sub-Paragraph"][text()]'):
        # [text()] also matches when the only text node is the tail of a
        # child element, in which case paragraph.text is None.
        if not paragraph.text:
            continue
        label, tab, remainder = paragraph.text.partition('\u0009')
        if not tab:
            # dont do anything if there is no tab
            continue
        span_hanging = Element('span')
        span_hanging.set('class', 'hanging1')
        span_hanging.text = label
        span_hanging.tail = remainder
        paragraph.text = ''
        # Insert first, not append: the split text was the paragraph's
        # leading text, so it must stay ahead of any inline children.
        paragraph.insert(0, span_hanging)

    # sort out all the bullets
    for bullet in input_root.xpath('//span[@class="pythonFindBullet"]'):
        bullet.drop_tree()

    # sort the numbers
    for number in input_root.xpath('//p[@class="paraQuestion"]/span[1]'):
        # cosider changing this in InDesign
        number.attrib['class'] = 'charBallotNumber'
        new_span = Element('span')
        new_span.classes.add('number-span')
        number_parent = number.getparent()
        # NOTE(review): unlike the ministerial statements below, the tail
        # text of the number is not detached first, so it moves into the
        # new span together with the number -- confirm this is intended.
        new_span.append(number)
        number_parent.insert(0, new_span)

    # sort ministerial statements
    for statement in input_root.xpath(
            '//p[@class="paraMinisterialStatement"]/span[1]'):
        statement.attrib['class'] = 'charItemNumber'
        # Detach the tail so it ends up after the wrapper span rather
        # than inside it.
        statement_tail_text = statement.tail
        statement.tail = ''
        new_span = Element('span')
        new_span.classes.add('number-span')
        new_span.tail = statement_tail_text
        number_parent = statement.getparent()
        new_span.append(statement)
        number_parent.insert(0, new_span)

    # sort the front page tables
    for table in input_root.xpath('//table[@class="Front-Page-Table"]'):
        # added as a result of an accessibility audit
        table.set('role', 'presentation')

    for colgroup in input_root.xpath(
            '//table[@class="Front-Page-Table"]/colgroup'):
        # assumes each colgroup has at least two children -- TODO confirm
        colgroup[0].attrib.pop("class", None)
        colgroup[0].attrib['width'] = '24%'
        colgroup[1].attrib.pop("class", None)
        colgroup[1].attrib['width'] = '76%'

    # sort motion sponsor groups
    sponsor_groups_xpath = '//p[@class="paraMotionSponsorGroup"]' \
        '|//p[@class="MotionAmmendmentSponsorGroup"]' \
        '|//p[@class="MotionAmmendmentSponsorGroup"]/span' \
        '|//p[@class="A2A-SponsorGroup"]'
    for sponsor_group in input_root.xpath(sponsor_groups_xpath):
        if not sponsor_group.text:
            continue
        sponsor_group.classes.add('row')
        # split text on the tab character (InDesign puts in)
        sponsor_names = sponsor_group.text.split('\u0009')
        sponsor_group.text = None
        for sponsor_name in sponsor_names:
            sponsor_span = SubElement(sponsor_group, 'span')
            sponsor_span.classes.update(('col-12', 'col-sm-6', 'col-lg-4'))
            sponsor_span.text = sponsor_name

    # <strong class="Bold"> is overkill
    for strong_ele in input_root.xpath('//strong'):
        strong_ele.classes.discard('Bold')

    # dont need <span class="Hyperlink"> in a <a>
    for span in input_root.xpath('//a/span[@class="Hyperlink"]'):
        span.drop_tag()

    # seems like sometimes there are empty span.charStandingOrderReference
    for span in input_root.xpath(
            '//span[@class="charStandingOrderReference"]'):
        if not span.text or span.text.isspace():
            span.drop_tag()

    # Front-Page-Table doesnt need to be on the table the row and the td
    for tr in input_root.xpath(
            '//table[@class="Front-Page-Table"]//tr[@class="Front-Page-Table"]'
    ):
        tr.classes.discard('Front-Page-Table')
        for child in tr.iterchildren('td', 'th'):
            child.classes.discard('Front-Page-Table')

    # Demote every heading one level (h1 -> h2, ... h5 -> h6; h6 stays).
    # Working from h5 upwards means a heading is never demoted twice.
    heading_tags = ['h6', 'h5', 'h4', 'h3', 'h2', 'h1']
    for heading_tag, new_heading_tag in zip(heading_tags[1:], heading_tags):
        for heading in input_root.xpath(f'//{heading_tag}'):
            heading.tag = new_heading_tag

    # return the modified input html root element
    return input_root