Example #1
def cache_pdf(session, url, srcurl, site_base_url, ptype, title, srctitle, grouptype, year, docsettype, more_meta=None):
    """
    Get a PDF with caching.
    more_meta is an optional list of additional metadata k,v pairs
    """
    url = normalise(url, srcurl)
    if url in _seen:
        logging.warning("ALREADY SEEN PDF %s", url)
        return

    # Mirror the URL's path under the local PDF cache directory.
    pdf_cache_file = url[len(site_base_url):]
    if pdf_cache_file.startswith('/'):
        pdf_cache_file = pdf_cache_file[1:]
    pdf_cache_file = os.path.join(pdf_cache_dir, pdf_cache_file)
    pdf_dir = os.path.dirname(pdf_cache_file)
    if not os.path.isdir(pdf_dir):
        os.makedirs(pdf_dir)
    meta_file = metautil.meta_path(pdf_cache_file)

    if (not _redo_meta) and os.path.exists(pdf_cache_file) and os.path.exists(meta_file):
        logging.debug("already have pdf and meta for %s", url)
        return

    try:
        meta = None
        if not _redo_meta:
            # On a normal run, fetch the PDF; a metadata-only redo skips the download.
            save(session, url, pdf_cache_file)

        # Write the metadata as one "key,value" line per field.
        meta = open(meta_file, 'wb')
        meta.write('url,%s\n' % url)
        meta.write('srcurl,%s\n' % srcurl)
        meta.write('title,%s\n' % title.encode('utf-8'))
        meta.write('srctitle,%s\n' % srctitle.encode('utf-8'))
        meta.write('pagetype,%s\n' % ptype.encode('utf-8'))
        if grouptype:
            meta.write('grouptype,%s\n' % grouptype.encode('utf-8'))
        meta.write('docsettype,%s\n' % docsettype.encode('utf-8'))
        meta.write('year,%s\n' % year.encode('utf-8'))
        meta.write('fetched,%s\n' % datetime.datetime.now())
        if more_meta is not None:
            for k, v in more_meta:
                meta.write('%s,%s\n' % (k, v.encode('utf-8')))

        meta.close()
    except:
        # Intentionally bare: always remove a half-written metadata file
        # before re-raising, so a later run retries this URL.
        if meta:
            meta.close()
            os.unlink(meta_file)
        raise
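
For context, here is a minimal sketch of how cache_pdf might be invoked. Everything below except the cache_pdf signature is an assumption: the requests.Session, the URLs, and the titles are made up for illustration, though the srctitle and docsettype values match the ones check_pdf in Example #2 filters for.

# Hypothetical caller: all URLs and titles are invented; only the
# cache_pdf signature comes from the example above.
import requests

session = requests.Session()
cache_pdf(
    session,
    url='/pdf/r0501/001.pdf',        # resolved by normalise() against srcurl
    srcurl='http://www.example.go.jp/press/index.html',
    site_base_url='http://www.example.go.jp',
    ptype='summary',
    title=u'Press release PDF',
    srctitle=u'政治資金規正法に基づく届出',
    grouptype=None,                  # falsy, so no grouptype line is written
    year=u'2023',
    docsettype=u'報道資料',
    more_meta=[('note', u'fetched during the 2023 crawl')],
)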
Example #2
def check_pdf(pdf_dir, pdf_file):
    """Classify a cached summary PDF and write the group name and document
    type back into its metadata file."""
    pdfpath = os.path.join(pdf_dir, pdf_file)
    metapath = metautil.meta_path(pdfpath)
    meta = metautil.get_meta(pdfpath)

    if meta is None:
        logging.warning('skip %s with no metadata', pdfpath)
        return

    if meta.get('pagetype') == 'kanpo':
        # No problem, just not the type of doc we process
        return

    if meta.get('pagetype') != 'summary':
        logging.warning('skip %s with page type %s (expect summary)',
                        pdfpath, meta.get('pagetype'))
        return

    if meta.get('srctitle') != u'政治資金規正法に基づく届出' or meta.get('docsettype') != u'報道資料':
        # No problem, just not the type of doc we process, or maybe already processed
        return

    pdftext = extract_pdf_text(pdfpath).decode('utf-8')

    # Some docs are really broken :(
    if pdf_file in HARDCODED_DOCS:
        gname = HARDCODED_DOCS[pdf_file]['gname']
        doctype = HARDCODED_DOCS[pdf_file]['doctype']
    elif '(cid:' in pdftext:
        # "(cid:NNN)" marks glyphs the text extractor couldn't map to characters.
        logging.warning('%s contains unknown characters', pdf_file)
        return
    else:
        lines = pdftext.splitlines()
        lines = [x.strip() for x in lines]
        gname = group_name(lines, pdf_file)
        if gname is None:
            logging.info('Couldn\'t decide a group for %s' % pdf_file)
            return
        doctype = todoke_type(lines, pdf_file)
        if doctype is None:
            logging.info('Couldn\'t decide a doctype for %s' % pdf_file)
            return
    assert gname is not None and doctype is not None
    update_meta(metapath, meta, gname, doctype)
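
Neither example defines the metautil helpers. Given the "key,value" lines cache_pdf writes, a plausible reconstruction is sketched below; the ".meta" filename suffix and the parsing details are assumptions, not the project's actual code.

# Hypothetical reconstruction of metautil, inferred from the metadata
# format written by cache_pdf in Example #1.
import os

def meta_path(pdfpath):
    # Assumption: the metadata file sits next to the PDF it describes.
    return pdfpath + '.meta'

def get_meta(pdfpath):
    # Parse "key,value" lines back into a dict; None if there is no file.
    path = meta_path(pdfpath)
    if not os.path.exists(path):
        return None
    meta = {}
    with open(path, 'rb') as f:
        for line in f.read().decode('utf-8').splitlines():
            if line:
                # Split on the first comma only: values such as titles
                # may themselves contain commas.
                key, _, value = line.partition(',')
                meta[key] = value
    return meta

Splitting on the first comma only matters because cache_pdf writes titles verbatim, and those can contain commas of their own.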