def instrument_location(instrument, location, args):
    """Locate a fragment of ``instrument`` by path string and render it as HTML.

    Falls back to the canonical form of ``location`` when the raw path either
    fails to resolve or degenerates to matching the entire document.
    """
    try:
        tree = nodes_from_path_string(instrument.get_tree(), location)
        # A single match that is the whole document means the path found
        # nothing useful -- treat it the same as a lookup failure.
        if len(tree) == 1 and tree[0] == instrument.get_tree():
            raise CustomException('try again')
    except CustomException:
        # Second attempt: normalise the location to its canonical form.
        tree = nodes_from_path_string(instrument.get_tree(),
                                      link_to_canonical(location))
    full_location, _, path = generate_path_string(tree[0])
    tree = cull_tree(tree)
    markup = etree.tostring(tohtml(tree), encoding='UTF-8', method="html")
    return {
        'html_content': markup,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'fragment',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'location',
            'location': path,
        },
    }
def case_preview():
    """Convert an uploaded PDF case file into an HTML preview fragment.

    Saves the uploaded file under a random name in /tmp, runs it through
    ``process_case`` and the case XSLT, and returns the serialised HTML.
    The temporary file is always removed, even if conversion fails.
    """
    upload = request.files['file']  # don't shadow the `file` builtin
    location = os.path.join('/tmp', str(uuid.uuid4()) + '.pdf')
    upload.save(location)
    try:
        case = etree.fromstring(process_case(location, debug=False))
        result = tohtml(case, xslt['case'])
    finally:
        # Previously the temp file leaked whenever process_case or the
        # transform raised; clean up unconditionally.
        os.unlink(location)
    return etree.tostring(result, encoding='UTF-8', method="html")
def process_contents(id, tree, db=None):
    """Render the document's table of contents to HTML and persist it.

    :param id: ``documents.id`` row to update.
    :param tree: lxml tree transformed via ``xslt/contents.xslt``.
    :param db: optional open DB connection; defaults to ``get_db()``.
    :return: the serialised contents HTML that was stored.
    """
    # Bind the connection ONCE.  The original evaluated `(db or get_db())`
    # separately for the cursor and for commit(); if get_db() does not
    # return the same connection each call, the commit would run on a
    # different connection than the UPDATE and the write could be lost.
    conn = db or get_db()
    contents = etree.tostring(
        tohtml(tree, os.path.join('xslt', 'contents.xslt')),
        encoding='UTF-8', method="html")
    with conn.cursor() as cur:
        query = """UPDATE documents d SET contents= %(contents)s WHERE d.id = %(id)s """
        cur.execute(query, {'id': id, 'contents': contents})
    conn.commit()
    return contents
def test_html_transform(self):
    # Compare our XSLT-rendered HTML against reference .html fixtures:
    # for each tests/instruments/*.html file, transform the matching .xml
    # and assert that every 'part' div's text content matches, ignoring
    # all non-word characters.
    path = 'tests/instruments'
    sub = re.compile('\W')  # strips whitespace/punctuation before comparing
    with app.test_request_context():
        for test_file in os.listdir(path):
            if test_file.endswith('html'):
                print test_file
                result = tohtml(
                    etree.parse(
                        os.path.join(path, test_file.replace('.html', '.xml'))))
                expected = etree.parse(
                    open(os.path.join(path, test_file)),
                    parser=etree.HTMLParser()).xpath('.//body/div[1]')[0]
                # findall much faster than xpath, but no 'or'
                # Drop boilerplate sections that the transform doesn't emit.
                remove = expected.findall(
                    './/div[@class="actbodylastpage"]') + expected.findall(
                        './/div[@class="contents"]')
                for r in remove:
                    r.getparent().remove(r)
                #xpath = './/div[@class="prov"]|.//div[@class="form"]'
                xpath = ".//div[contains(concat(' ', @class, ' '), ' part ')]"
                results = result.xpath(xpath)
                for i, seg in enumerate(expected.xpath(xpath)):
                    expected_seg = sub.sub(
                        '', etree.tostring(seg, method="text", encoding="utf-8"))
                    #print expected_seg
                    if len(results) <= i:
                        # Fewer parts produced than expected: show the missing one.
                        print etree.tostring(seg, method="text", encoding="utf-8")
                    result_seg = sub.sub(
                        '', etree.tostring(results[i], method="text", encoding="utf-8"))
                    # First index where the two strings diverge (if any).
                    # NOTE(review): this py2 list comprehension leaks and
                    # rebinds the outer `i`; harmless here because `i` is
                    # reassigned by enumerate() before its next use, but
                    # fragile -- confirm before reordering statements.
                    x = [
                        i for i in xrange(len(result_seg))
                        if i >= len(expected_seg) or result_seg[i] != expected_seg[i]
                    ]
                    if len(x):
                        # Print a window of context around the first mismatch.
                        print result_seg[x[0] - 10:x[0] + 100]
                        print expected_seg[x[0] - 10:x[0] + 100]
                    self.assertEqual(result_seg, expected_seg)
def render(self):
    """Serialise this definition's results to HTML and return its summary dict.

    Wraps each result's optional context, its xml, and its optional src in a
    <catalex-def-para> element, runs it through transform_def.xslt, and
    returns the rendered HTML alongside the definition's metadata.
    """
    container = etree.Element('catalex-def-para')
    for item in self.results:
        if 'context' in item:
            container.append(etree.fromstring(item['context']))
        container.append(etree.fromstring(item['xml']))
        if 'src' in item:
            container.append(etree.fromstring(item['src']))
    rendered = etree.tostring(
        tohtml(container, os.path.join('xslt', 'transform_def.xslt')),
        encoding='UTF-8', method="html")
    return {
        'full_word': self.full_word,
        'html': rendered,
        'expiry_tags': self.expiry_tags,
        'id': self.id,
        'keys': list(self.keys),
        'priority': self.priority,
    }
def instrument_preview(instrument):
    """Render a size-limited preview of ``instrument`` as an HTML fragment."""
    trimmed = limit_tree_size(instrument.get_tree())
    markup = etree.tostring(tohtml(trimmed), encoding='UTF-8', method="html")
    attrs = instrument.attributes
    return {
        'html_content': markup,
        'title': instrument.title,
        'full_title': instrument.title,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": attrs['latest'],
        "path": attrs['path'],
        "date_as_at_str": format_govt_date(attrs['date_as_at']),
        'format': 'preview',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'preview',
        },
    }
def instrument_full(instrument, args=None):
    """Render the complete instrument as HTML.

    When the USE_SKELETON config flag is set, very long documents -- or
    requests asking for highlighting -- are answered with a skeleton
    response instead, letting the client lazy-load parts on demand.

    :param instrument: the instrument document to render.
    :param args: optional request-argument mapping (only 'highlight' is read).
    """
    # `args={}` was a shared mutable default argument; None is the safe form
    # and is backward-compatible (callers passing a dict are unaffected).
    args = args or {}
    # Documents longer than this are considered too large to send whole.
    skeleton_threshold = 100000
    if current_app.config.get('USE_SKELETON') and (
            instrument.length > skeleton_threshold or args.get('highlight')):
        return instrument_skeleton_response(instrument, args)
    return {
        'html_content': etree.tostring(tohtml(instrument.get_tree()),
                                       encoding='UTF-8', method="html"),
        'title': instrument.title,
        'full_title': instrument.title,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'full',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'full',
        },
    }
def instrument_govt_location(instrument, id, link_text, args):
    """Resolve a government-style location id (or link text) within
    ``instrument`` and render the matched fragment as HTML."""
    nodes = decide_govt_or_path(instrument.get_tree(), id, link_text)
    full_location, _, location = generate_path_string(nodes[0])
    fragment = cull_tree(nodes)
    markup = etree.tostring(tohtml(fragment), encoding='UTF-8', method="html")
    attrs = instrument.attributes
    return {
        'html_content': markup,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": attrs['latest'],
        "path": attrs['path'],
        "date_as_at_str": format_govt_date(attrs['date_as_at']),
        'format': 'fragment',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'govt_location',
            'govt_location': id,
        },
    }
def test_partials(self):
    # For every tests/partial_instruments/*.xml fixture, transform it to
    # HTML and structurally compare the result against the matching .html
    # reference (attributes ignored).
    for f in [
            f for f in os.listdir('tests/partial_instruments')
            if f.endswith('.xml')
    ]:
        print f
        with codecs.open(os.path.join('tests/partial_instruments', f),
                         encoding='utf-8') as fp:
            result = tohtml(etree.fromstring(fp.read(), self.parser)).getroot()
        with codecs.open(os.path.join('tests/partial_instruments',
                                      f.replace('.xml', '.html')),
                         encoding='utf-8') as fp:
            #with open(os.path.join('tests/partial_instruments', f.replace('.xml', '.html'))) as fp:
            #expected = html.fromstring(re.sub(ur'\xe2\x80\x99', "'", fp.read(), flags=re.UNICODE),
            # ugly_replace normalises awkward characters in the fixture
            # before parsing -- presumably smart quotes; see its definition.
            expected = html.fromstring(ugly_replace(fp.read()),
                                       parser=etree.HTMLParser(
                                           remove_blank_text=True,
                                           encoding="utf-8"))
        self.assertTrue(
            xml_compare(result, expected, print_error, do_attr=False))
def process_heights(id, tree, version, db=None):
    """Measure rendered heights of a document's top-level provisions and
    persist them.

    Renders ``tree`` to HTML, tags each top-level provision/schedule div
    with a ``data-hook`` index, measures the markup with
    ``measure_heights`` and, when any provision was found, stores the
    heights as JSON in ``documents.heights``.

    :param id: ``documents.id`` row to update.
    :param tree: lxml tree of the instrument XML.
    :param version: format version passed through to ``extra_formatting``.
    :param db: optional open DB connection; defaults to ``get_db()``.
    :return: the heights value produced by ``measure_heights``.
    """
    # Apply formatting BEFORE transforming, matching process_skeleton.
    # Previously extra_formatting ran after tohtml(tree), so it could not
    # influence the HTML whose heights were measured.
    extra_formatting(tree, version)
    html = tohtml(tree)
    parts = False
    # Top-level provisions/schedules only: skip anything nested inside
    # another provision, schedule or amendment block.
    for i, div in enumerate(
            html.xpath(
                './/div[@class="prov" or @class="schedule"][not(ancestor::div[@class="prov"] or ancestor::div[@class="schedule"] or ancestor::div[@class="amend"])]'
            )):
        div.attrib['data-hook'] = '%d' % i
        parts = True
    # super expensive
    heights = measure_heights(
        etree.tostring(html, encoding='UTF-8', method="html"))
    # (The original also serialised the HTML into an unused `skeleton`
    # local here; that dead work has been removed.)
    db = db or get_db()
    if parts:
        with db.cursor() as cur:
            query = """UPDATE documents d SET heights = %(heights)s WHERE d.id = %(id)s """
            cur.execute(query, {'id': id, 'heights': json.dumps(heights)})
    db.commit()
    return heights
def process_skeleton(id, tree, version, db=None):
    """Build and persist the lazy-load 'skeleton' of a document.

    Renders ``tree`` to HTML, extracts every top-level provision/schedule
    div (numbered via ``data-hook`` attributes) into the
    ``document_parts`` table along with a best-effort title, then empties
    those divs in the stored skeleton -- keeping child ids/locations as
    attributes so lookups still work -- and saves the gutted markup to
    ``documents.skeleton``.  Finally re-indexes the document in
    elasticsearch (best effort; failures are logged, not raised).

    :param id: ``documents.id`` row to update.
    :param tree: lxml tree of the instrument XML (mutated by
        ``extra_formatting``).
    :param version: format version passed through to ``extra_formatting``.
    :param db: optional open DB connection; defaults to ``get_db()``.
    """
    """ don't check git blame """
    parts = []
    extra_formatting(tree, version)
    html = tohtml(tree)
    max_size = 20000  # upper bound (serialised bytes) for one chunk
    min_size = 200    # chunks smaller than this are not worth extracting
    i = [0]  # NOTE(review): never read; appears to be a leftover -- confirm
    def wrap(tag, nodes):
        # Serialise ``nodes``; if large enough, replace them with a single
        # <tag> element carrying the next data-hook index, and stash the
        # original markup in ``parts`` for the document_parts table.
        string = ''.join([
            etree.tostring(n, encoding='UTF-8', method="html") for n in nodes
        ])
        if len(string) < min_size:
            return nodes
        div = etree.Element(tag)
        div.attrib['data-hook'] = '%d' % len(parts)
        div[:] = nodes
        parts.append(string)
        return [div]
    def depth(node):
        # Recursively group ``node``'s children into ~max_size chunks.
        # NOTE(review): only reachable via the commented-out call below,
        # so this helper (and wrap's non-div tag path) is currently dead.
        running = 0   # serialised size of the pending group
        to_join = []  # children accumulated for the next wrap()
        results = []
        for j, n in list(enumerate(node)):
            length = len(etree.tostring(n))
            if n.tag == 'table':
                # Flush the pending group, then keep the table whole.
                # NOTE(review): the pending group is wrapped in a <table>
                # tag here (n.tag), not a <div> -- looks suspicious; confirm.
                if len(to_join):
                    results += wrap(n.tag, to_join)
                    to_join = []
                results += wrap('div', [n])
            elif length > max_size:
                # Oversized child: flush the pending group and recurse.
                if len(to_join):
                    results += wrap('div', to_join)
                    to_join = []
                    running = 0
                results += [depth(n)]
            else:
                if running + length > max_size:
                    results += wrap('div', to_join)
                    to_join = [n]
                    running = 0
                else:
                    running += len(etree.tostring(n))
                    to_join.append(n)
        if len(to_join):
            results += wrap('div', to_join)
        node[:] = results
        return node
    #depth(html.getroot())
    # Tag each top-level provision/schedule (not nested inside another
    # provision, schedule or amendment) and capture its title + markup.
    for i, div in enumerate(
            html.xpath(
                './/div[@class="prov" or @class="schedule"][not(ancestor::div[@class="prov"] or ancestor::div[@class="schedule"] or ancestor::div[@class="amend"])]'
            )):
        # if too big, try to gut
        title = ''
        try:
            # Turn <br> breaks into spaces so titles serialise on one line.
            for br in div.xpath('.//br'):
                br.tail = ' ' + (br.tail or '')
            if div.attrib['class'] == 'prov':
                label = div.xpath('.//h5[@class="prov labelled"]')[0]
                for br in label.xpath('.//span[@class="label"]'):
                    br.tail = ' ' + (br.tail or '')
                title = etree.tostring(label, encoding='UTF-8', method="text")
            else:
                # Schedules: the title lives in the header cell; work on a
                # copy so the whitespace tweaks don't alter the stored markup.
                label = deepcopy(div.xpath('.//td[@class="header"]')[0])
                for br in label.xpath('.//br'):
                    br.tail = ' ' + (br.tail or '')
                title = etree.tostring(label, encoding='UTF-8', method="text")
        except IndexError:
            # No recognisable heading found; keep the empty title.
            pass
        parts.append(
            (title, etree.tostring(div, encoding='UTF-8', method="html")))
        div.attrib['data-hook'] = '%d' % i
    """ super expensive """
    # NOTE(review): ``heights`` is computed here but never used or stored by
    # this function (process_heights persists heights) -- confirm whether
    # this call is a leftover or needed for side effects.
    heights = measure_heights(
        etree.tostring(html, encoding='UTF-8', method="html"))
    """ Now remove all the parts' children, saving things we may need to look up """
    for el in html.xpath('.//*[@data-hook]'):
        ids = ';'.join(map(lambda e: e.attrib['id'], el.xpath('.//*[@id]')))
        locations = ';'.join(
            map(lambda e: e.attrib['data-location'],
                el.xpath('.//*[@data-location]')))
        el.attrib['data-child-ids'] = ids
        el.attrib['data-child-locations'] = locations
        el[:] = []
    skeleton = etree.tostring(html, encoding='UTF-8', method="html")
    db = db or get_db()
    with db.cursor() as cur:
        query = """UPDATE documents d SET skeleton = %(skeleton)s WHERE d.id = %(id)s """
        cur.execute(query, {'id': id, 'skeleton': skeleton})
    if len(parts):
        with db.cursor() as cur:
            # Replace any previously stored parts for this document.
            cur.execute(
                'DELETE FROM document_parts WHERE document_id = %(id)s',
                {'id': id})
            args_str = ','.join(
                cur.mogrify("(%s, %s, %s, %s)", (id, i, p[0], p[1]))
                for i, p in enumerate(parts))
            cur.execute(
                'INSERT INTO document_parts (document_id, num, title, data) VALUES '
                + args_str)
    db.commit()
    try:
        # Best effort: a failed elasticsearch load must not abort processing.
        insert_instrument_es(id, db)
    except Exception, e:
        current_app.logger.error('Could not load %d into elasticsearch' % id)
        current_app.logger.error(e)