def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Truncates ``output`` (default ``data.tsv``), then walks every
    ``(meta, d, pdf)`` triple produced by ``data()`` and passes it to
    ``find_authors`` together with ``output``.  For each document where
    ``find_authors`` returns a truthy value, ``gs`` is called on
    ``meta['cached']`` and the first page of the pdf is collected.  If the
    weights in ``weights.pkl~`` could be loaded, every item on that page is
    classified with ``predict``; items not labelled ``'other'`` get a
    coloured border and are echoed to the console.

    Finally the collected pages are rendered through ``template`` into
    ``outfile``, which is then opened in a web browser.

    Outputs data.tsv (or whatever ``output`` names) and ``outfile``.
    """

    # Create (or truncate) the file; we'll be appending to it as we go along.
    # `open` is used instead of the Python-2-only `file` builtin.
    with open(output, 'wb'):
        pass

    try:
        w = load('weights.pkl~')
    except IOError:
        # Best effort: without weights we still build the data set, we just
        # skip the prediction/highlighting step below.
        print('failed to load file')
        w = None

    pages = []
    # islice(..., None) imposes no limit; swap None for a count to process
    # only the first few documents.
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            gs(meta['cached'], outdir)
            pages.append(pdf.pages[0])

            # NOTE(review): highlighting is assumed to apply only to pages
            # kept for rendering -- confirm against the intended nesting.
            if w is not None:
                for x in pdf.pages[0].items:
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        # NOTE(review): authors get a 'green' border here but
                        # 'magenta' in markup_pdf -- confirm this is intended.
                        x.style['border'] = '2px solid %s' % {
                            'author': 'green',
                            'title': 'blue',
                        }[y]
                        c = {'author': magenta, 'title': blue}[y]
                        print('%s: %s' % (c % y, x.text))

    # If we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        # The next page starts below everything stacked so far.
        yoffset += p.height

    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Creates an image of the first page (via ``gs``; presumably Ghostscript --
    confirm), classifies every item on that page with the weights loaded from
    ``weights.pkl~`` and gives items not labelled ``'other'`` a coloured
    border, echoing them to the console.  The page is then rendered through
    ``template`` into ``outfile``, which is opened in a web browser.

    If the weights cannot be loaded the page is rendered without highlights.
    """

    try:
        w = load('weights.pkl~')
    except IOError:
        # Best effort: render the page without predictions.
        print('failed to load file')
        w = None

    pages = []

    filename = path(filename)
    pdf = pdfminer(filename)
    gs(filename, outdir)
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue',
                }[y]
                c = {'author': magenta, 'title': blue}[y]
                print('%s: %s' % (c % y, x.text))

    # If we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...  (Only one page is collected here, so its offset stays 0.)
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    # `open` is used instead of the Python-2-only `file` builtin.
    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    # Use the path directly rather than reading it off the closed handle.
    webbrowser.open(outfile)
def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Truncates ``output`` (default ``data.tsv``), then walks every
    ``(meta, d, pdf)`` triple produced by ``data()`` and passes it to
    ``find_authors`` together with ``output``.  For each document where
    ``find_authors`` returns a truthy value, ``gs`` is called on
    ``meta['cached']`` and the first page of the pdf is collected.  If the
    weights in ``weights.pkl~`` could be loaded, every item on that page is
    classified with ``predict``; items not labelled ``'other'`` get a
    coloured border and are echoed to the console.

    Finally the collected pages are rendered through ``template`` into
    ``outfile``, which is then opened in a web browser.

    Outputs data.tsv (or whatever ``output`` names) and ``outfile``.
    """

    # Create (or truncate) the file; we'll be appending to it as we go along.
    # `open` is used instead of the Python-2-only `file` builtin.
    with open(output, 'wb'):
        pass

    try:
        w = load('weights.pkl~')
    except IOError:
        # Best effort: without weights we still build the data set, we just
        # skip the prediction/highlighting step below.
        print('failed to load file')
        w = None

    pages = []
    # islice(..., None) imposes no limit; swap None for a count to process
    # only the first few documents.
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            gs(meta['cached'], outdir)
            pages.append(pdf.pages[0])

            # NOTE(review): highlighting is assumed to apply only to pages
            # kept for rendering -- confirm against the intended nesting.
            if w is not None:
                for x in pdf.pages[0].items:
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        # NOTE(review): authors get a 'green' border here but
                        # 'magenta' in markup_pdf -- confirm this is intended.
                        x.style['border'] = '2px solid %s' % {
                            'author': 'green',
                            'title': 'blue',
                        }[y]
                        c = {'author': magenta, 'title': blue}[y]
                        print('%s: %s' % (c % y, x.text))

    # If we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        # The next page starts below everything stacked so far.
        yoffset += p.height

    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Creates an image of the first page (via ``gs``; presumably Ghostscript --
    confirm), classifies every item on that page with the weights loaded from
    ``weights.pkl~`` and gives items not labelled ``'other'`` a coloured
    border, echoing them to the console.  The page is then rendered through
    ``template`` into ``outfile``, which is opened in a web browser.

    If the weights cannot be loaded the page is rendered without highlights.
    """

    try:
        w = load('weights.pkl~')
    except IOError:
        # Best effort: render the page without predictions.
        print('failed to load file')
        w = None

    pages = []

    filename = path(filename)
    pdf = pdfminer(filename)
    gs(filename, outdir)
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue',
                }[y]
                c = {'author': magenta, 'title': blue}[y]
                print('%s: %s' % (c % y, x.text))

    # If we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...  (Only one page is collected here, so its offset stays 0.)
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    # `open` is used instead of the Python-2-only `file` builtin.
    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    # Use the path directly rather than reading it off the closed handle.
    webbrowser.open(outfile)