Example #1
0
def instrument_location(instrument, location, args):
    """Build the 'fragment' response for a location inside an instrument.

    The location is first resolved as given.  If that lookup raises
    CustomException, or yields exactly one node that compares equal to the
    instrument tree itself, the lookup is repeated with the location passed
    through link_to_canonical.  ``args`` is accepted but not used here.
    """
    try:
        matched = nodes_from_path_string(instrument.get_tree(), location)
        needs_retry = (len(matched) == 1
                       and matched[0] == instrument.get_tree())
    except CustomException:
        needs_retry = True
    if needs_retry:
        # second attempt: same lookup, canonicalised location
        matched = nodes_from_path_string(instrument.get_tree(),
                                         link_to_canonical(location))

    full_location, _, path = generate_path_string(matched[0])
    fragment = cull_tree(matched)
    response = {
        'html_content': etree.tostring(tohtml(fragment),
                                       encoding='UTF-8',
                                       method="html"),
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(
            instrument.attributes['date_as_at']),
        'format': 'fragment',
    }
    # the query block lets the client re-request this exact fragment
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'location',
        'location': path,
    }
    return response
Example #2
0
def case_preview():
    """Convert the uploaded 'file' of the current request to case HTML.

    The upload is saved under /tmp with a random ``.pdf`` name, handed to
    process_case, parsed as XML and transformed with ``xslt['case']``.

    Returns the serialised HTML.  The temporary file is removed whether or
    not the conversion succeeds.
    """
    upload = request.files['file']
    location = os.path.join('/tmp', str(uuid.uuid4()) + '.pdf')
    upload.save(location)
    try:
        case = etree.fromstring(process_case(location, debug=False))
        result = tohtml(case, xslt['case'])
    finally:
        # always clean up, otherwise a failed conversion leaks the upload
        os.unlink(location)
    return etree.tostring(result, encoding='UTF-8', method="html")
Example #3
0
def process_contents(id, tree, db=None):
    """Render the contents HTML for a document and store it.

    ``tree`` is transformed with xslt/contents.xslt, serialised as HTML and
    written to ``documents.contents`` for the row with the given ``id``.

    ``db`` is the connection to use; when falsy, get_db() is used.
    Returns the serialised contents.
    """
    # resolve the connection once so the UPDATE and the commit are
    # guaranteed to go through the same connection
    db = db or get_db()
    contents = etree.tostring(
        tohtml(tree, os.path.join('xslt', 'contents.xslt')),
        encoding='UTF-8',
        method="html")
    with db.cursor() as cur:
        query = """UPDATE documents d SET contents=  %(contents)s
                    WHERE d.id =  %(id)s """
        cur.execute(query, {'id': id, 'contents': contents})
    db.commit()
    return contents
Example #4
0
    def test_html_transform(self):
        """Compare transformed instruments with their stored HTML fixtures.

        For every ``*html`` file in tests/instruments the matching ``.xml``
        file is run through tohtml.  The text of each div whose class list
        contains ``part`` is stripped of non-word characters and compared,
        position by position, with the same divs of the fixture.
        """
        path = 'tests/instruments'
        non_word = re.compile(r'\W')
        # any div whose space separated class attribute contains "part"
        xpath = ".//div[contains(concat(' ', @class, ' '), ' part ')]"

        def flat_text(node):
            # text-only serialisation with all non-word characters removed
            return non_word.sub(
                '', etree.tostring(node, method="text", encoding="utf-8"))

        with app.test_request_context():
            for test_file in os.listdir(path):
                if not test_file.endswith('html'):
                    continue
                print(test_file)
                result = tohtml(
                    etree.parse(
                        os.path.join(path,
                                     test_file.replace('.html', '.xml'))))
                # close the fixture as soon as it has been parsed
                with open(os.path.join(path, test_file)) as fp:
                    expected = etree.parse(
                        fp,
                        parser=etree.HTMLParser()).xpath('.//body/div[1]')[0]
                # findall much faster than xpath, but no 'or'
                remove = expected.findall(
                    './/div[@class="actbodylastpage"]') + expected.findall(
                        './/div[@class="contents"]')
                for r in remove:
                    r.getparent().remove(r)
                results = result.xpath(xpath)
                for i, seg in enumerate(expected.xpath(xpath)):
                    if len(results) <= i:
                        # show what is missing, then fail instead of
                        # dying with an IndexError on results[i]
                        print(etree.tostring(seg,
                                             method="text",
                                             encoding="utf-8"))
                        self.fail('%s: transformed output has no part #%d'
                                  % (test_file, i))
                    expected_seg = flat_text(seg)
                    result_seg = flat_text(results[i])
                    if result_seg != expected_seg:
                        # offset of the first difference; when one text is a
                        # prefix of the other, that is the shorter length
                        first_diff = next(
                            (k for k, (got, want)
                             in enumerate(zip(result_seg, expected_seg))
                             if got != want),
                            min(len(result_seg), len(expected_seg)))
                        # clamp so a small offset cannot wrap to the end
                        start = max(first_diff - 10, 0)
                        print(result_seg[start:first_diff + 100])
                        print(expected_seg[start:first_diff + 100])

                    self.assertEqual(result_seg, expected_seg)
Example #5
0
 def render(self):
     """Return this definition as a dict, with its results rendered to HTML.

     Each result contributes its optional 'context', its 'xml' and its
     optional 'src' fragment, in that order, to a single
     ``catalex-def-para`` element which is then transformed with
     xslt/transform_def.xslt.
     """
     wrapper = etree.Element('catalex-def-para')
     for entry in self.results:
         for key in ('context', 'xml', 'src'):
             # 'xml' is always read; the other two only when present
             if key == 'xml' or key in entry:
                 wrapper.append(etree.fromstring(entry[key]))
     stylesheet = os.path.join('xslt', 'transform_def.xslt')
     html = etree.tostring(tohtml(wrapper, stylesheet),
                           encoding='UTF-8',
                           method="html")
     rendered = {'full_word': self.full_word, 'html': html}
     rendered['expiry_tags'] = self.expiry_tags
     rendered['id'] = self.id
     rendered['keys'] = list(self.keys)
     rendered['priority'] = self.priority
     return rendered
Example #6
0
def instrument_preview(instrument):
    """Build the 'preview' response for an instrument.

    The HTML is produced from the instrument tree after it has been passed
    through limit_tree_size; the remaining fields are copied from the
    instrument and its attributes.
    """
    trimmed = limit_tree_size(instrument.get_tree())
    response = {
        'html_content': etree.tostring(tohtml(trimmed),
                                       encoding='UTF-8',
                                       method="html"),
        'title': instrument.title,
        'full_title': instrument.title,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(
            instrument.attributes['date_as_at']),
        'format': 'preview',
    }
    # parameters describing how this response was requested
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'preview',
    }
    return response
Example #7
0
def instrument_full(instrument, args=None):
    """Build the 'full' response for an instrument.

    When the USE_SKELETON config flag is set and either the instrument's
    ``length`` exceeds the threshold below or ``args`` carries a truthy
    'highlight' entry, the work is delegated to
    instrument_skeleton_response instead.

    ``args`` is an optional mapping of request arguments; omitting it (or
    passing None) is the same as passing an empty dict.
    """
    # a fresh dict per call: a shared ``{}`` default would leak state
    # between calls if anything downstream mutated it
    args = {} if args is None else args
    # instruments with a ``length`` above this are served as a skeleton
    skeleton_threshold = 100000

    use_skeleton = current_app.config.get('USE_SKELETON') and (
        instrument.length > skeleton_threshold or args.get('highlight'))
    if use_skeleton:
        return instrument_skeleton_response(instrument, args)

    return {
        'html_content': etree.tostring(tohtml(instrument.get_tree()), encoding='UTF-8', method="html"),
        'title': instrument.title,
        'full_title': instrument.title,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'full',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'full'
        }
    }
Example #8
0
def instrument_govt_location(instrument, id, link_text, args):
    """Build the 'fragment' response for a govt-id / link-text lookup.

    decide_govt_or_path selects the nodes from the instrument tree; the
    first node supplies the full title and the culled node list is
    rendered to HTML.  ``args`` is accepted but not used here.
    """
    selected = decide_govt_or_path(instrument.get_tree(), id, link_text)
    # only the first element of the generated triple is needed here
    full_location, _, _ = generate_path_string(selected[0])
    fragment = cull_tree(selected)
    response = {
        'html_content': etree.tostring(tohtml(fragment),
                                       encoding='UTF-8',
                                       method="html"),
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(
            instrument.attributes['date_as_at']),
        'format': 'fragment',
    }
    # parameters describing how this response was requested
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'govt_location',
        'govt_location': id,
    }
    return response
Example #9
0
 def test_partials(self):
     """Check each partial instrument against its stored HTML rendering.

     Every ``.xml`` file in tests/partial_instruments is transformed with
     tohtml and compared, ignoring attributes, with the ``.html`` file of
     the same name.
     """
     fixture_dir = 'tests/partial_instruments'
     xml_names = [
         name for name in os.listdir(fixture_dir) if name.endswith('.xml')
     ]
     for xml_name in xml_names:
         print(xml_name)
         xml_path = os.path.join(fixture_dir, xml_name)
         with codecs.open(xml_path, encoding='utf-8') as fp:
             source = etree.fromstring(fp.read(), self.parser)
             result = tohtml(source).getroot()
         html_path = os.path.join(fixture_dir,
                                  xml_name.replace('.xml', '.html'))
         with codecs.open(html_path, encoding='utf-8') as fp:
             # the fixture text goes through ugly_replace before parsing
             cleaned = ugly_replace(fp.read())
             html_parser = etree.HTMLParser(remove_blank_text=True,
                                            encoding="utf-8")
             expected = html.fromstring(cleaned, parser=html_parser)
         matches = xml_compare(result, expected, print_error, do_attr=False)
         self.assertTrue(matches)
Example #10
0
def process_heights(id, tree, version, db=None):
    """Measure the rendered heights of a document and store them.

    Every ``prov`` or ``schedule`` div that is not nested inside another
    ``prov``, ``schedule`` or ``amend`` div is tagged with a sequential
    ``data-hook`` attribute.  The tagged HTML is passed to measure_heights
    and, when at least one div was tagged, the result is written as JSON
    to ``documents.heights`` for the row with the given ``id``.

    ``db`` is the connection to use; when falsy, get_db() is used.
    Returns the value produced by measure_heights.
    """
    html = tohtml(tree)
    # NOTE(review): process_skeleton calls extra_formatting *before* tohtml;
    # here it runs afterwards. Order kept as found - confirm it is intended.
    extra_formatting(tree, version)
    hooked = html.xpath(
        './/div[@class="prov" or @class="schedule"][not(ancestor::div[@class="prov"] or ancestor::div[@class="schedule"] or ancestor::div[@class="amend"])]'
    )
    for i, div in enumerate(hooked):
        div.attrib['data-hook'] = '%d' % i
    # super expensive
    heights = measure_heights(
        etree.tostring(html, encoding='UTF-8', method="html"))
    db = db or get_db()
    if hooked:
        with db.cursor() as cur:
            query = """UPDATE documents d SET heights = %(heights)s
                        WHERE d.id =  %(id)s """
            cur.execute(query, {'id': id, 'heights': json.dumps(heights)})
    db.commit()
    return heights
Example #11
0
def process_skeleton(id, tree, version, db=None):
    """Split a document into parts, store them with a skeleton, and reindex.

    After extra_formatting and tohtml, every ``prov`` or ``schedule`` div
    that is not nested inside another ``prov``, ``schedule`` or ``amend``
    div becomes one part: its title text and serialised HTML are recorded
    and the div is tagged with a sequential ``data-hook`` attribute.  The
    hooked elements are then emptied, keeping the ids and data-locations
    of their former descendants as attributes, and the remaining HTML is
    stored in ``documents.skeleton``.  When there are parts, the rows in
    ``document_parts`` for this document are replaced by them.

    After the commit the document is pushed to elasticsearch; a failure
    there is logged and not re-raised.

    ``db`` is the connection to use; when falsy, get_db() is used.
    """
    extra_formatting(tree, version)
    html = tohtml(tree)
    parts = []

    for i, div in enumerate(
            html.xpath(
                './/div[@class="prov" or @class="schedule"][not(ancestor::div[@class="prov"] or ancestor::div[@class="schedule"] or ancestor::div[@class="amend"])]'
            )):
        title = ''
        try:
            # put a space after every <br> so text on either side of it
            # does not run together
            for br in div.xpath('.//br'):
                br.tail = ' ' + (br.tail or '')
            if div.attrib['class'] == 'prov':
                label = div.xpath('.//h5[@class="prov labelled"]')[0]
                for span in label.xpath('.//span[@class="label"]'):
                    span.tail = ' ' + (span.tail or '')
            else:
                # work on a copy: the spacing below is for the title only
                label = deepcopy(div.xpath('.//td[@class="header"]')[0])
                for br in label.xpath('.//br'):
                    br.tail = ' ' + (br.tail or '')
            title = etree.tostring(label, encoding='UTF-8', method="text")
        except IndexError:
            # no heading element found: the part keeps an empty title
            pass
        # serialise before adding data-hook, so stored parts do not carry it
        parts.append(
            (title, etree.tostring(div, encoding='UTF-8', method="html")))
        div.attrib['data-hook'] = '%d' % i

    # super expensive
    # NOTE(review): the result is not used in this function; the call is
    # kept in case measure_heights has side effects - confirm and drop.
    measure_heights(etree.tostring(html, encoding='UTF-8', method="html"))

    # Now remove all the parts' children, saving things we may need to
    # look up
    for el in html.xpath('.//*[@data-hook]'):
        el.attrib['data-child-ids'] = ';'.join(
            [e.attrib['id'] for e in el.xpath('.//*[@id]')])
        el.attrib['data-child-locations'] = ';'.join(
            [e.attrib['data-location']
             for e in el.xpath('.//*[@data-location]')])
        el[:] = []

    skeleton = etree.tostring(html, encoding='UTF-8', method="html")

    db = db or get_db()
    with db.cursor() as cur:
        query = """UPDATE documents d SET skeleton =  %(skeleton)s
                    WHERE d.id =  %(id)s """
        cur.execute(query, {'id': id, 'skeleton': skeleton})

    if parts:
        with db.cursor() as cur:
            cur.execute(
                'DELETE FROM document_parts WHERE document_id = %(id)s',
                {'id': id})
            # mogrify quotes each row, so one multi-row INSERT is safe
            args_str = ','.join(
                cur.mogrify("(%s, %s, %s, %s)", (id, num, p[0], p[1]))
                for num, p in enumerate(parts))
            cur.execute(
                'INSERT INTO document_parts (document_id, num, title, data) VALUES '
                + args_str)

    db.commit()
    try:
        insert_instrument_es(id, db)
    except Exception as e:
        # best effort: the database is already committed, so only log.
        # Lazy logger arguments keep a bad id from raising in here.
        current_app.logger.error('Could not load %d into elasticsearch', id)
        current_app.logger.error(e)