def pdfrefs(in_fp):
    """Extract the numbered reference list from a pdf file.

    The pdf is converted with ``pdf2xml`` and its ``<text>`` elements are
    scanned in document order. Collection starts after the first element
    whose text contains the word "references" (case-insensitive). From
    there on, an element whose text starts with ``[N] `` opens reference
    ``N`` and every following element is treated as a continuation of it.

    Args:
        in_fp: The pdf input, passed unchanged to ``pdf2xml``.

    Returns:
        A dict mapping each reference number (``int``) to its text, with
        the pieces of a multi-element reference joined by single spaces.
        The dict is empty when no numbered reference is found.
    """
    xml = pdf2xml(in_fp)
    ref_re = re.compile(r'\[(\d+)\] (.*)')
    in_ref = False
    refs = {}
    key = None
    for t in xml.iter('text'):
        # Use all nested text, not just ``t.text``: text wrapped in child
        # markup such as <b>/<i>/<a> would otherwise be lost, including a
        # bold "References" heading (``t.text`` is None in that case).
        text = ''.join(t.itertext())
        if not text:
            continue
        # Only look for the heading until the first entry has been opened;
        # afterwards a line mentioning "references" belongs to an entry and
        # must not be discarded.
        if key is None and 'references' in text.lower():
            in_ref = True
            continue
        if not in_ref:
            continue
        ref_m = ref_re.match(text)
        if ref_m:
            key = int(ref_m.group(1))
            refs[key] = [ref_m.group(2)]
        elif key is not None:
            refs[key].append(text)
        # else: stray text between the heading and the first "[N]" entry has
        # no reference to attach to, so it is skipped (previously KeyError).
    return {k: ' '.join(v) for k, v in refs.items()}
def pdftitle(in_fp):
    """Guess the title of a pdf file from the largest font on page 1.

    The pdf is converted with ``pdf2xml``. Among the ``<fontspec>``
    elements of the first page the one with the largest ``size`` is chosen
    (the last one wins on ties), and the text of all first-page ``<text>``
    elements using that font is concatenated.

    Args:
        in_fp: The pdf input, passed unchanged to ``pdf2xml``.

    Returns:
        The title with every run of whitespace collapsed to a single space,
        or an empty string when page 1 declares no font or has no text in
        the chosen font.
    """
    xml = pdf2xml(in_fp)
    fontspecs = xml.findall("./page[@number='1']/fontspec")
    if not fontspecs:
        return ''

    # max() keeps the first maximal item, so iterate in reverse to preserve
    # the "last fontspec wins on equal size" rule.
    largest = max(reversed(fontspecs), key=lambda f: int(f.attrib['size']))
    font_id = int(largest.attrib['id'])

    # Restrict the search to page 1: the font was picked from page 1 only,
    # and text on later pages that reuses the same font id is not part of
    # the title.
    titles = xml.xpath(
        ".//page[@number='1']//text[@font='{0}']"
        "/descendant-or-self::text()".format(font_id)
    )
    title = ' '.join(titles).strip()
    return re.sub(r'\s+', ' ', title)