Example #1
def summary_pdf_downloader(session, href, text, base_url, node):
    """Cache one PDF linked directly from the summary page.

    The summary page has a few different pdfs linked directly from it:
    総務大臣届出分 - a breakdown of all the numbers
    総務大臣届出分+都道府県選管届出分 - breakdown of numbers including regional (summary only)
    政治資金規正法に基づく届出 - notices of group formation/dissolution etc
    """
    assert href.endswith('.pdf')
    # Walk up from the link node: the <a> lives in a <dd> inside a <dl>,
    # and the <dl> is headed by an <h5> naming the document set.
    dd = node.getparent()
    assert dd.tag == 'dd'
    dl = dd.getparent()
    assert dl.tag == 'dl'
    head = dl.getprevious()
    assert head.tag == 'h5'
    # Scan back over sibling <dd>s to the <dt> (if any) that titles this entry.
    while dd is not None and dd.tag == 'dd':
        dd = dd.getprevious()
    if dd is not None and dd.tag == 'dt':
        assert head.text.startswith(u'政治資金収支報告の概要')
        #docsettype = u'政治資金収支報告の概要'
        # Strip the surrounding bracket characters from the <dt> text.
        srctitle = dd.text[1:-1]
    else:
        # No <dt> — this is the notices section; use its heading verbatim.
        assert head.text.startswith(u'政治資金規正法に基づく届出')
        srctitle = u'政治資金規正法に基づく届出'

    docsettype = u'報道資料'

    title = text + u' (報道資料)'
    ptype = 'summary'
    grouptype = None
    # weak=True: the link text may not contain a year at all.
    year = get_nenbun(text, weak=True)

    cache_pdf(session, href, base_url, SITE, ptype, title, srctitle, grouptype, year, docsettype)
Example #2
def kanpo(session, url, title, base_url):
    """ 官報 data - page of pdf links with hash-parts and a different format"""
    logging.debug('%s: %s, %s' % ('kanpo', title, url))
    data = cache_page(session, url, base_url)
    if data is None:
        # Page could not be fetched/cached; nothing to do.
        return
    urls, encoding, pagetitle = parse_page(data)
    urls = filter(summary_url_filter(url), urls)

    # Map each distinct PDF href to the list of ('page', "text\tpageno")
    # entries that link into it.
    linkdata = {}

    for href, text, node in urls:
        if '#' in href:
            # Links carry a '#page=N' fragment pointing into the PDF.
            href, hashpart = href.split('#')
            assert hashpart.startswith('page=')
            pageno = int(hashpart[5:])
        else:
            # No fragment: the link targets the first page.
            pageno = 1
        link = u'%s\t%d' % (text, pageno)
        if href in linkdata:
            linkdata[href].append(('page', link))
        else:
            linkdata[href] = [('page', link)]

    ptype = 'kanpo'
    srctitle = u'政治資金収支報告書の要旨'
    grouptype = None
    year = get_nenbun(title)
    docsettype = u'官報'
    title = title + u' (官報)'

    for href, links in linkdata.iteritems():
        cache_pdf(session, href, base_url, SITE, ptype, title, srctitle, grouptype, year, docsettype, more_meta=links)
Example #3
def _pdf_list_page(session, url, site_base_url, ptype, title, srcurl, data, repurls, encoding, pagetitle, _grouptype=None, _year=None):
    """ Process a page which is just links to a bunch of PDFs """
    pdfurls = filter(lambda u: u[0].endswith('.pdf'), repurls)
    for purl, ptitle, node in pdfurls:
        if ptitle is None:
            # HACK: Hand code a couple of broken corner cases
            if purl.endswith('SD20110228/220210.pdf') or purl.endswith('SA20110228/200210.pdf'):
                ptitle = u'高健社'
            else:
                # Genuinely untitled link; record it with a placeholder title.
                logging.warn('No title for %s..' % purl)
                ptitle = u'(無題)'

        # Find the group type and year for non teiki pages
        grouptype = _grouptype
        year = _year
        if grouptype is None:
            assert year is None
            # Group type is given in a preceding <strong> element, e.g. [政党].
            previous = node.getprevious()
            while previous is not None and previous.tag != 'strong':
                previous = previous.getprevious()
            # Group types aren't always in kaisan..
            if previous is not None:
                grouptype = previous.text.strip('[]')
            # The year lives in a preceding <span> relative to the enclosing <div>.
            previous = node.getparent()
            if previous is not None and previous.tag != 'div':
                previous = previous.getparent()
            while previous is not None and previous.tag != 'span':
                previous = previous.getprevious()
            if previous is None:
                # No pages like this yet
                import pdb; pdb.set_trace()
            year = get_nenbun(previous.text)

        cache_pdf(session, purl, url, site_base_url, ptype, title, ptitle, grouptype, year, pagetitle)
Example #4
def _page_with_children(session, url, title, ptype, base_url, data, repurls, encoding, pagetitle, _grouptype=None, _year=None):
    """ Process data that is split into sub-pages. """
    year = _year or get_nenbun(title)

    # Column headers (group types) of the table currently being processed,
    # cached so we only parse them once per table.
    grouptypes = None
    grouptype_tb = None

    for suburl, linktitle, node in repurls:
        # Links sit in a table cell: <a> -> <td> -> <tr> -> <table>.
        td = node.getparent()
        assert td.tag == 'td'
        tr = td.getparent()
        assert tr.tag == 'tr'
        tb = tr.getparent()
        assert tb.tag == 'table'

        if tb.attrib.get('id').startswith('list-item'):
            # embedded table.. climb one more table level to the outer one.
            td = tb.getparent()
            assert td.tag == 'td'
            tr = td.getparent()
            assert tr.tag == 'tr'
            tb = tr.getparent()
            assert tb.tag == 'table'

        if grouptypes is None or tb != grouptype_tb:
            # New table: read its <th> headers as the group type per column.
            types = tb.xpath('./tr/th')
            grouptypes = [''.join(x.itertext()).strip() for x in types]
            grouptype_tb = tb

        colno = tr.xpath('./td').index(td)

        # A column beyond the headers is only acceptable if the caller
        # supplied a group type for us to fall back on.
        if not (colno < len(grouptypes) or _grouptype is not None):
            import pdb; pdb.set_trace()

        if colno < len(grouptypes):
            grouptype = grouptypes[colno]

            # Horrible hack method to check title, works on some pages..
            # Look at the HTML comment immediately preceding the link.
            grouptype_offset = data.rfind('<!--', 0, data.index(suburl)) + 5
            grouptype_comment = data[grouptype_offset:grouptype_offset+20].decode(encoding)
            grouptype_b = grouptype_comment.split()[0]
            if grouptype_b == u'議員別' and grouptype.startswith(u'国会議員関係政治団体'):
                grouptype = grouptype_b
            elif grouptype_b.startswith('<'):
                # Nope, not what we were looking for.
                grouptype_b = None

            if not (not grouptype_b or grouptype_b == u'タイトル終了' or grouptype_b == grouptype):
                import pdb; pdb.set_trace()

            # If the caller pinned a group type it must agree with the table.
            assert _grouptype is None or grouptype == _grouptype
        else:
            # No header for this column: use the caller-supplied group type.
            grouptype = _grouptype

        suburl = normalise(suburl, url)
        logging.debug('   %s, %s %s' % (suburl, linktitle, grouptype))
        if suburl.endswith('.pdf'):
            cache_pdf(session, suburl, url, base_url, ptype, title, linktitle, grouptype, year, pagetitle)
        else:
            # Not a PDF: recurse into the sub-page.
            combined_title = u'%s\t%s' % (title, linktitle)
            sub_ptype = ptype + 'sub'
            page_auto(session, suburl, base_url, sub_ptype, combined_title, url, data=None, grouptype=grouptype, year=year)