import logging
import os
import re
import subprocess

import metautil


def check_pdf(pdf_dir, pdf_file):
    """Classify a summary PDF and record its group name and doc type."""
    pdfpath = os.path.join(pdf_dir, pdf_file)
    metapath = metautil.meta_path(pdfpath)
    meta = metautil.get_meta(pdfpath)
    if meta is None:
        logging.warn('skip %s with no metadata' % pdfpath)
        return
    if meta.get('pagetype') == 'kanpo':
        # No problem, just not the type of doc we process
        return
    if meta.get('pagetype') != 'summary':
        logging.warn('skip %s with page type %s (expect summary)'
                     % (pdfpath, meta.get('pagetype')))
        return
    if (meta.get('srctitle') != u'政治資金規正法に基づく届出'
            or meta.get('docsettype') != u'報道資料'):
        # No problem, just not the type of doc we process,
        # or maybe it was already processed
        return
    pdftext = extract_pdf_text(pdfpath).decode('utf-8')
    # Some docs are really broken :(
    if pdf_file in HARDCODED_DOCS:
        gname = HARDCODED_DOCS[pdf_file]['gname']
        doctype = HARDCODED_DOCS[pdf_file]['doctype']
    elif '(cid:' in pdftext:
        logging.warn('%s contains unknown characters' % pdf_file)
        return
    else:
        lines = [x.strip() for x in pdftext.splitlines()]
        gname = group_name(lines, pdf_file)
        if gname is None:
            logging.info("Couldn't decide a group for %s" % pdf_file)
            return
        doctype = todoke_type(lines, pdf_file)
        if doctype is None:
            logging.info("Couldn't decide a doctype for %s" % pdf_file)
            return
    assert gname is not None and doctype is not None
    update_meta(metapath, meta, gname, doctype)
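
# extract_pdf_text is not defined in this section. For reference, a minimal
# sketch of what it might look like, assuming it shells out to Poppler's
# pdftotext (consistent with the pdfinfo call used below) and returns raw
# UTF-8 bytes for the caller to decode. The name carries a _sketch suffix so
# it cannot shadow the real helper; this is an illustration, not the actual
# implementation.
def extract_pdf_text_sketch(pdfpath):
    # '-' sends the extracted text to stdout; '-enc UTF-8' forces the
    # encoding that the caller decodes with .decode('utf-8').
    p = subprocess.Popen(['pdftotext', '-enc', 'UTF-8', pdfpath, '-'],
                         stdout=subprocess.PIPE)
    (stdoutdata, stderrdata) = p.communicate()
    return stdoutdata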
def check_pdf(s, pdf_path, pdf_root, api_root, docs_by_url, nodefer,
              groupsonly):
    """Register a PDF with the API: resolve its group, docset and document."""
    relative_path = pdf_path[len(pdf_root):]
    meta = metautil.get_meta(pdf_path)
    if meta is None:
        logging.warn("Skip %s which has no metadata!" % relative_path)
        return
    if 'title' not in meta or 'url' not in meta or 'srcurl' not in meta:
        logging.warn("Invalid metadata for %s!" % relative_path)
        return
    url = meta['url']
    if url in docs_by_url:
        # Already in the db; verify the contents.. should be the same source
        record = docs_by_url[url]
        if record['srcurl'] != meta['srcurl']:
            logging.warn('Different sources for %s at %s and in db: %s vs %s'
                         % (url, relative_path, record['srcurl'],
                            meta['srcurl']))
        return
    gname = meta['srctitle']
    note = None
    notepart = MULTIPART_RE.search(gname)
    # logging.info(u"Group %s notepart %s." % (gname, notepart))
    if notepart is not None:
        notepart = notepart.groups()
        gname = notepart[0]
        note = notepart[1]
        if note:
            note = note.strip()
    gname = gname.strip()
    if 'grouptype' not in meta and gname not in _group_cache:
        if nodefer:
            logging.info(u"Recording %s as unknown." % (gname,))
            meta['grouptype'] = u'不明'
        else:
            logging.info(u"Defer %s (%s) to get more group data"
                         % (relative_path, gname))
            return
    gtype = meta.get('grouptype')
    # Normalize group types; sometimes the name has a の..
    if gtype == u'政党の本部':
        gtype = u'政党本部'
    if gtype == u'政党の支部':
        gtype = u'政党支部'
    if gtype == u'総括文書(支部分)':
        gtype = u'政党支部'
    if gtype == u'資金管理団体(国会議員関係政治団体を除く。)':
        gtype = u'資金管理団体'
    if gtype == u'国会議員関係政治団体(政党の支部を除く。)':
        gtype = u'国会議員関係政治団体'
    if gtype == u'政党':
        # This could be honbu or shibu
        gtype = None
    title = meta['title']
    title_parts = title.split('\t')
    parent = None
    if len(title_parts) == 2:
        parent = title_parts[1].strip()
        if len(parent) <= 1:
            parent = None
    group = get_or_make_group(s, api_root, gname, gtype, parent)
    if group is None:
        # Something went wrong.. unknown type?
        return
    docdir, docfname = os.path.split(relative_path)
    docset = get_or_make_docset(s, api_root, title, meta['docsettype'], docdir)
    if groupsonly:
        return
    # Collect pdf stats - size and pages
    fsize = os.stat(pdf_path).st_size
    pagesre = re.compile(r'Pages:\s+(\d+)')
    p1 = subprocess.Popen(['pdfinfo', pdf_path], stdout=subprocess.PIPE)
    (stdoutdata, stderrdata) = p1.communicate()
    pagecount = 0
    m = pagesre.search(stdoutdata)
    if not m:
        # Unexpected pdfinfo output; drop into the debugger to inspect.
        import pdb; pdb.set_trace()
    else:
        pagecount = int(m.groups()[0])
    # Finally.. make the doc.
    document = make_doc(s, api_root, docset['id'], meta['year'], group['id'],
                        docfname, meta['url'], meta['srcurl'], fsize,
                        pagecount, note)
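
# A hypothetical driver, sketching how this check_pdf might be invoked over
# a local PDF mirror. Everything below is an assumption for illustration:
# that s is a requests.Session, and that pdf_root, api_root and the
# docs_by_url mapping (url -> existing db record) are placeholders, not
# values defined in this module.
if __name__ == '__main__':
    import requests
    session = requests.Session()
    pdf_root = '/data/pdfs/'                  # assumed mirror root
    api_root = 'http://localhost:8000/api/'   # assumed API endpoint
    docs_by_url = {}                          # assumed pre-fetched db index
    for dirpath, dirnames, filenames in os.walk(pdf_root):
        for fname in filenames:
            if fname.endswith('.pdf'):
                check_pdf(session, os.path.join(dirpath, fname), pdf_root,
                          api_root, docs_by_url, nodefer=False,
                          groupsonly=False)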