def main(output='data.tsv'): """ Build data set from user annotation. Outputs data.tsv """ # create file, we'll be appending to it as we go along with file(output, 'wb') as f: f.write('') try: w = load('weights.pkl~') except IOError: print 'failed to load file' w = None pages = [] for meta, d, pdf in islice(data(), None): if find_authors(meta, d, pdf, output): gs(meta['cached'], outdir) pages.append(pdf.pages[0]) if w is not None: for x in pdf.pages[0].items: y = predict(w, {k: 1.0 for k in features(x)}) if y != 'other': x.style['border'] = '2px solid %s' % { 'author': 'green', 'title': 'blue' }[y] c = {'author': magenta, 'title': blue}[y] print '%s: %s' % (c % y, x.text) # if we want to draw the first pages of many pdfs on one html document we # have to lie to the items -- tell them they are on pages other than the # first... yoffset = 0 for p in pages: for item in p.items: if hasattr(item, 'yoffset'): item.yoffset += yoffset yoffset += p.height with file(outfile, 'wb') as f: template.render_context(Context(f, pages=pages)) import webbrowser webbrowser.open(outfile)
def markup_pdf(filename): """ Apply learned model on a pdf. Creates a image of the first page. """ try: w = load('weights.pkl~') except IOError: print 'failed to load file' w = None pages = [] filename = path(filename) pdf = pdfminer(filename) gs(filename, outdir) pages.append(pdf.pages[0]) if w is not None: for x in pdf.pages[0].items: y = predict(w, {k: 1.0 for k in features(x)}) if y != 'other': x.style['border'] = '2px solid %s' % { 'author': 'magenta', 'title': 'blue' }[y] c = {'author': magenta, 'title': blue}[y] print '%s: %s' % (c % y, x.text) # if we want to draw the first pages of many pdfs on one html document we # have to lie to the items -- tell them they are on pages other than the # first... yoffset = 0 for p in pages: for item in p.items: if hasattr(item, 'yoffset'): item.yoffset += yoffset yoffset += p.height with file(outfile, 'wb') as f: template.render_context(Context(f, pages=pages)) import webbrowser webbrowser.open(f.name)
def main(output='data.tsv'): """ Build data set from user annotation. Outputs data.tsv """ # create file, we'll be appending to it as we go along with file(output, 'wb') as f: f.write('') try: w = load('weights.pkl~') except IOError: print 'failed to load file' w = None pages = [] for meta, d, pdf in islice(data(), None): if find_authors(meta, d, pdf, output): gs(meta['cached'], outdir) pages.append(pdf.pages[0]) if w is not None: for x in pdf.pages[0].items: y = predict(w, {k: 1.0 for k in features(x)}) if y != 'other': x.style['border'] = '2px solid %s' % {'author': 'green', 'title': 'blue'}[y] c = {'author': magenta, 'title': blue}[y] print '%s: %s' % (c % y, x.text) # if we want to draw the first pages of many pdfs on one html document we # have to lie to the items -- tell them they are on pages other than the # first... yoffset = 0 for p in pages: for item in p.items: if hasattr(item, 'yoffset'): item.yoffset += yoffset yoffset += p.height with file(outfile, 'wb') as f: template.render_context(Context(f, pages=pages)) import webbrowser webbrowser.open(outfile)
def markup_pdf(filename):
    """
    Apply learned model on a pdf. Creates a image of the first page.
    """
    # Model weights are optional: if loading fails the page is still
    # rendered, only the predicted-label markup is skipped.
    try:
        w = load('weights.pkl~')
    except IOError:
        print 'failed to load file'
        w = None
    pages = []
    filename = path(filename)
    pdf = pdfminer(filename)
    # NOTE(review): `outdir` looks like a module-level constant — confirm.
    gs(filename, outdir)
    pages.append(pdf.pages[0])
    if w is not None:
        for x in pdf.pages[0].items:
            # Binary feature vector: every feature present gets weight 1.0.
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                # Predicted labels get a colored border in the HTML output...
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue'
                }[y]
                # ...and a colorized echo on the terminal (magenta/blue are
                # presumably %-style ANSI templates — TODO confirm).
                c = {'author': magenta, 'title': blue}[y]
                print '%s: %s' % (c % y, x.text)
    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height
    # NOTE(review): `outfile` and `template` look like module-level globals.
    with file(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))
    import webbrowser
    webbrowser.open(f.name)
def find_authors(meta, d, pdf, output): authors = [set(shingle(x.strip())) for x in meta['author']] author = ' ; '.join(meta['author']) title = meta['title'] T = set(shingle(title.strip())) if not pdf: return items = pdf.pages[0].items author_candidates = [] title_candidates = [] for x in items: if 'text' not in x.attributes: continue text = x.text text = re.sub(',', ' ', text) text = text.encode('utf8', 'ignore') # HACK: ignores non-ascii b = shingle(text) b = set(b) if not b: continue dist = -len(T & b) * 1.0 / len(T | b) if dist <= -0.1: title_candidates.append(((dist, -x.fontsize), x)) distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors) if distance > -0.2: continue author_candidates.append(((distance, -x.fontsize), x)) if not author_candidates or not title_candidates: print red % 'Sorry, no lines in the document :-(' return for x in items: x.attributes['label'] = 'other' for x in heuristic(title, title_candidates): x.attributes['label'] = 'title' x.style['background-color'] = 'rgba(0,0,255,0.2)' for x in heuristic(author, author_candidates): x.attributes['label'] = 'author' x.style['background-color'] = 'rgba(0,255,0,0.2)' # dump training data to file. with file(output, 'a') as f: for item in items: f.write(item.attributes['label']) f.write('\t') f.write('alwayson') f.write('\t') f.write('\t'.join(features(item))) f.write('\n') print return True