def summary_pdf_downloader(session, href, text, base_url, node):
    """ the summary page has a few different pdfs linked directly from it:
        総務大臣届出分 - a breakdown of all the numbers
        総務大臣届出分+都道府県選管届出分 - breakdown of numbers including regional (summary only)
        政治資金規正法に基づく届出 - notices of group formation/dissolution etc
    """
    assert href.endswith('.pdf')
    # Walk up from the link: a <dd> inside a <dl> whose preceding sibling is the <h5> heading
    dd = node.getparent()
    assert dd.tag == 'dd'
    dl = dd.getparent()
    assert dl.tag == 'dl'
    head = dl.getprevious()
    assert head.tag == 'h5'
    # Walk back over the <dd>s to the <dt> that labels this group of links, if there is one
    while dd is not None and dd.tag == 'dd':
        dd = dd.getprevious()
    if dd is not None and dd.tag == 'dt':
        assert head.text.startswith(u'政治資金収支報告の概要')
        #docsettype = u'政治資金収支報告の概要'
        # Strip the surrounding brackets from the <dt> text
        srctitle = dd.text[1:-1]
    else:
        assert head.text.startswith(u'政治資金規正法に基づく届出')
        srctitle = u'政治資金規正法に基づく届出'
    docsettype = u'報道資料'
    title = text + u' (報道資料)'
    ptype = 'summary'
    grouptype = None
    year = get_nenbun(text, weak=True)
    cache_pdf(session, href, base_url, SITE, ptype, title, srctitle, grouptype, year, docsettype)
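# Illustrative sketch (an assumption, not copied from the live site) of the markup
# summary_pdf_downloader() walks: the PDF link's <dd> sits inside a <dl> whose
# preceding sibling is the <h5> heading, and the nearest preceding <dt>, when
# present, carries the bracketed source title that becomes srctitle:
#
#   <h5>政治資金収支報告の概要</h5>
#   <dl>
#     <dt>（総務大臣届出分）</dt>
#     <dd><a href="gaiyou.pdf">報道資料</a></dd>
#   </dl>
#
# With no preceding <dt>, the heading is the 政治資金規正法に基づく届出 list and
# srctitle falls back to that fixed string.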
def kanpo(session, url, title, base_url):
    """ 官報 (official gazette) data - page of pdf links with hash-parts and a different format """
    logging.debug('%s: %s, %s' % ('kanpo', title, url))
    data = cache_page(session, url, base_url)
    if data is None:
        return
    urls, encoding, pagetitle = parse_page(data)
    urls = filter(summary_url_filter(url), urls)
    linkdata = {}
    for href, text, node in urls:
        # Links point into specific pages of each PDF via a '#page=N' fragment
        if '#' in href:
            href, hashpart = href.split('#')
            assert hashpart.startswith('page=')
            pageno = int(hashpart[5:])
        else:
            pageno = 1
        link = u'%s\t%d' % (text, pageno)
        if href in linkdata:
            linkdata[href].append(('page', link))
        else:
            linkdata[href] = [('page', link)]
    ptype = 'kanpo'
    srctitle = u'政治資金収支報告書の要旨'
    grouptype = None
    year = get_nenbun(title)
    docsettype = u'官報'
    title = title + u' (官報)'
    # Fetch each PDF once, carrying the per-page link titles as extra metadata
    for href, links in linkdata.iteritems():
        cache_pdf(session, href, base_url, SITE, ptype, title, srctitle, grouptype, year, docsettype, more_meta=links)
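# Illustrative example (hypothetical hrefs) of how kanpo() folds the '#page=N'
# links: several anchors pointing into the same PDF are grouped under one href,
# so each file is fetched once and the per-page titles travel as more_meta:
#
#   <a href="youshi.pdf#page=1">目次</a>
#   <a href="youshi.pdf#page=5">政党</a>
#
#   linkdata == {u'youshi.pdf': [('page', u'目次\t1'), ('page', u'政党\t5')]}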
def _pdf_list_page(session, url, site_base_url, ptype, title, srcurl, data, repurls, encoding, pagetitle, _grouptype=None, _year=None):
    """ Process a page which is just links to a bunch of PDFs """
    pdfurls = filter(lambda u: u[0].endswith('.pdf'), repurls)
    for purl, ptitle, node in pdfurls:
        if ptitle is None:
            # HACK: Hand code a couple of broken corner cases
            if purl.endswith('SD20110228/220210.pdf') or purl.endswith('SA20110228/200210.pdf'):
                ptitle = u'高健社'
            else:
                logging.warn('No title for %s..' % purl)
                ptitle = u'(無題)'
        # Find the group type and year for non teiki pages
        grouptype = _grouptype
        year = _year
        if grouptype is None:
            assert year is None
            # The group type is a bracketed label in the nearest preceding <strong>
            previous = node.getprevious()
            while previous is not None and previous.tag != 'strong':
                previous = previous.getprevious()
            # Group types aren't always in kaisan..
            if previous is not None:
                grouptype = previous.text.strip('[]')
            # The year lives in a <span> preceding the link's containing <div>
            previous = node.getparent()
            if previous is not None and previous.tag != 'div':
                previous = previous.getparent()
            while previous is not None and previous.tag != 'span':
                previous = previous.getprevious()
            if previous is None:
                # No pages like this yet
                import pdb
                pdb.set_trace()
            year = get_nenbun(previous.text)
        cache_pdf(session, purl, url, site_base_url, ptype, title, ptitle, grouptype, year, pagetitle)
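# Rough sketch (assumed layout, not checked against every page) of what
# _pdf_list_page() looks for when _grouptype/_year aren't passed in: a bracketed
# group type in the nearest preceding <strong>, and a year heading in a <span>
# sitting before the <div> that contains the link:
#
#   <span>平成22年分</span>
#   <div>
#     <strong>[政党]</strong>
#     <a href="000001.pdf">○○党本部</a>
#   </div>
#
# get_nenbun() then extracts the year from the <span> text; if no <span> is
# found, the pdb.set_trace() above fires, since no such page has been seen yet.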
def _page_with_children(session, url, title, ptype, base_url, data, repurls, encoding, pagetitle, _grouptype=None, _year=None):
    """ Process data that is split into sub-pages. """
    year = _year or get_nenbun(title)
    grouptypes = None
    grouptype_tb = None
    for suburl, linktitle, node in repurls:
        # Walk up from the link to its containing cell, row and table
        td = node.getparent()
        assert td.tag == 'td'
        tr = td.getparent()
        assert tr.tag == 'tr'
        tb = tr.getparent()
        assert tb.tag == 'table'
        if tb.attrib.get('id', '').startswith('list-item'):
            # embedded table.. step up to the outer table, whose headers we want
            td = tb.getparent()
            assert td.tag == 'td'
            tr = td.getparent()
            assert tr.tag == 'tr'
            tb = tr.getparent()
            assert tb.tag == 'table'
        # Re-read the group type headers whenever we move to a new table
        if grouptypes is None or tb != grouptype_tb:
            types = tb.xpath('./tr/th')
            grouptypes = [''.join(x.itertext()).strip() for x in types]
            grouptype_tb = tb
        colno = tr.xpath('./td').index(td)
        if not (colno < len(grouptypes) or _grouptype is not None):
            import pdb; pdb.set_trace()
        if colno < len(grouptypes):
            grouptype = grouptypes[colno]
            # Horrible hack method to check title, works on some pages..
            grouptype_offset = data.rfind('<!--', 0, data.index(suburl)) + 5
            grouptype_comment = data[grouptype_offset:grouptype_offset + 20].decode(encoding)
            grouptype_b = grouptype_comment.split()[0]
            if grouptype_b == u'議員別' and grouptype.startswith(u'国会議員関係政治団体'):
                grouptype = grouptype_b
            elif grouptype_b.startswith('<'):
                # Nope, not what we were looking for.
                grouptype_b = None
            if not (not grouptype_b or grouptype_b == u'タイトル終了' or grouptype_b == grouptype):
                import pdb; pdb.set_trace()
            assert _grouptype is None or grouptype == _grouptype
        else:
            grouptype = _grouptype
        suburl = normalise(suburl, url)
        logging.debug(' %s, %s %s' % (suburl, linktitle, grouptype))
        if suburl.endswith('.pdf'):
            cache_pdf(session, suburl, url, base_url, ptype, title, linktitle, grouptype, year, pagetitle)
        else:
            combined_title = u'%s\t%s' % (title, linktitle)
            sub_ptype = ptype + 'sub'
            page_auto(session, suburl, base_url, sub_ptype, combined_title, url, data=None, grouptype=grouptype, year=year)
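# Rough sketch (assumed layout) of the tables _page_with_children() reads: the
# header row's <th> cells name the group types and a link's column index picks
# one of them; links can also sit inside a nested <table id="list-item...">, in
# which case the outer table's headers are the ones that apply:
#
#   <table>
#     <tr><th>政党</th><th>資金管理団体</th></tr>
#     <tr>
#       <td><a href="sub1.html">...</a></td>
#       <td><table id="list-item1"><tr><td><a href="sub2.html">...</a></td></tr></table></td>
#     </tr>
#   </table>
#
# HTML sub-pages recurse through page_auto() with the group type and year carried
# along; PDF links are cached directly.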