Example #1
0
 def __init__(self, en_list=False):
     if en_list:
         self._listhandler = hanreifetch.EnListHandler()
         self._jikenparser = hanreifetch.EnJikenParser()
     else:
         self._listhandler = hanreifetch.ListHandler()
         self._jikenparser = hanreifetch.JikenParser()
     self._in_english = en_list
def create_hanrei_elem():
    import HTMLParser
    hp = HTMLParser.HTMLParser()
    hanrei_elem = hanreifetch.JikenParser().create_hanrei_element(
        sample_jiken(), hanreiid='52442')
    expected_elem = etree.fromstring(sample_jiken_xml().encode(
        hanreifetch.XML_ENCODING)).findall(u'./Hanrei')[0]
    hanrei_xml = hp.unescape(etree.tostring(hanrei_elem,
                                            pretty_print=True)).strip()
    expected_xml = hp.unescape(etree.tostring(expected_elem,
                                              pretty_print=True)).strip()
    assert ([hline.strip() for hline in hanrei_xml.split(u'\n')
             ] == [eline.strip() for eline in expected_xml.split(u'\n')])
def parse_single_attribute():
    attr_block_html = (u'<body class="dummy">'
                       u'<div class="dlist">'
                       u'  <div>'
                       u'    <div class="list4_top">attr1</div>'
                       u'    <div class="list5_top">'
                       u'      value1'
                       u'    </div>'
                       u'  </div>'
                       u'  <div class="clear"></div>'
                       u'  <div>'
                       u'    <div class="list4">attr2</div>'
                       u'    <div class="list5_long">'
                       u'      value2'
                       u'    </div>'
                       u'  </div>'
                       u'  <div class="clear"></div>'
                       u'</div>'
                       u'<div class="dlist">'
                       u'  <div>'
                       u'    <div class="list4_pdf">attr3</div>'
                       u'    <div class="list5_long">'
                       u'      value3'
                       u'    </div>'
                       u'  </div>'
                       u'  <div class="clear"></div>'
                       u'</div>'
                       u'</body>')
    attrs = list(hanreifetch.JikenParser().hanrei_attrs_from(attr_block_html))
    assert attrs == [
        (
            u'attr1',
            u'value1',
        ),
        (
            u'attr2',
            u'value2',
        ),
        (
            u'attr3',
            u'value3',
        ),
    ]
def create_hanrei_xml():
    hanrei_xml = hanreifetch.JikenParser().create_xml_from([(sample_jiken(),
                                                             '52442')])
    expected_xml = sample_jiken_xml()
    assert hanrei_xml == expected_xml
def full_text_from_web():
    full_text = hanreifetch.JikenParser().get_full_text(
        SAMPLE_HANREI_PDF_ORIGIN)
    expected_text = sample_hanrei_full_text()
    assert full_text == expected_text
def create_hanrei_struct():
    hanrei = hanreifetch.JikenParser().create_struct_from(sample_jiken())
    expected_attrmap = sample_jiken_struct_attrmap()
    for key in expected_attrmap:
        assert getattr(hanrei, key) == expected_attrmap[key]
def detect_all_hanrei_attrs():
    detected = list(hanreifetch.JikenParser().hanrei_attrs_from(
        sample_jiken()))
    expected_attrs = sample_jiken_attr_pairs()
    assert set(detected) == set(expected_attrs)