Example #1
0
def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Empties ``output``, then hands every ``(meta, d, pdf)`` triple yielded by
    ``data()`` to ``find_authors`` together with ``output``.  For each triple
    for which ``find_authors`` returns a true value, the first page of the pdf
    is collected and -- when a model could be loaded from ``weights.pkl~`` --
    each item on that page is labelled with ``predict``.  Items whose label is
    not ``'other'`` get a coloured border and are echoed to stdout.  The
    collected pages are finally rendered through ``template`` into ``outfile``
    and that file is opened in the web browser.

    Outputs data.tsv (by default).

    :param output: path of the file passed on to ``find_authors``; it is
        truncated on entry.
    """

    # create (or empty) the file, we'll be appending to it as we go along.
    # ``open`` replaces the Python-2-only ``file`` constructor; opening in
    # 'wb' mode already truncates, so nothing needs to be written.
    with open(output, 'wb'):
        pass

    # The model is optional: without it pages are still collected and
    # rendered, just without predicted highlights.
    try:
        w = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        w = None

    pages = []
    # islice(..., None) does not limit anything; it is kept as-is, presumably
    # as a handy place to cap the number of documents while debugging.
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            # NOTE(review): presumably renders the cached pdf into outdir
            # (ghostscript?) -- confirm against gs().
            gs(meta['cached'], outdir)
            pages.append(pdf.pages[0])

            if w is not None:
                for x in pdf.pages[0].items:
                    # every feature is active with weight 1.0
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        # NOTE(review): the 'author' border is 'green' while
                        # the console output below uses ``magenta`` -- confirm
                        # this mismatch is intended.
                        x.style['border'] = '2px solid %s' % {
                            'author': 'green',
                            'title': 'blue'
                        }[y]
                        # ``magenta``/``blue`` are applied as format strings;
                        # presumably terminal colour wrappers.
                        c = {'author': magenta, 'title': blue}[y]
                        print('%s: %s' % (c % y, x.text))

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
Example #2
0
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Parses ``filename`` with ``pdfminer``, and -- when a model could be loaded
    from ``weights.pkl~`` -- labels every item on the first page with
    ``predict``.  Items whose label is not ``'other'`` get a coloured border
    and are echoed to stdout.  The first page is then rendered through
    ``template`` into ``outfile`` and that file is opened in the web browser.

    Creates an image of the first page (presumably via the ``gs`` call --
    confirm against gs()).

    :param filename: location of the pdf; it is wrapped with ``path`` before
        use.
    """

    # The model is optional: without it the page is still rendered, just
    # without predicted highlights.
    try:
        w = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        w = None

    pages = []

    filename = path(filename)

    pdf = pdfminer(filename)

    gs(filename, outdir)
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            # every feature is active with weight 1.0
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue'
                }[y]
                # ``magenta``/``blue`` are applied as format strings;
                # presumably terminal colour wrappers.
                c = {'author': magenta, 'title': blue}[y]
                print('%s: %s' % (c % y, x.text))

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...  (only one page is collected here, so every offset added is 0)
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    # ``open`` replaces the Python-2-only ``file`` constructor.
    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    # f is closed at this point, but its ``name`` attribute is still readable.
    webbrowser.open(f.name)
Example #3
0
def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Steps, in order:

    1. truncate ``output``;
    2. try to load a model from ``weights.pkl~`` (continue without one on
       ``IOError``);
    3. for every ``(meta, d, pdf)`` yielded by ``data()`` for which
       ``find_authors(meta, d, pdf, output)`` is true, collect the pdf's first
       page and, if a model is available, mark and print every item that
       ``predict`` does not label ``'other'``;
    4. stack the collected pages vertically, render them through ``template``
       into ``outfile`` and open that file in the web browser.

    Outputs data.tsv (by default).

    :param output: path of the file passed on to ``find_authors``; it is
        truncated on entry.
    """

    # create (or empty) the file, we'll be appending to it as we go along.
    # Opening with 'wb' truncates by itself; ``open`` is used rather than the
    # Python-2-only ``file`` constructor.
    with open(output, 'wb'):
        pass

    try:
        w = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        w = None

    pages = []
    # A stop of None means islice passes everything through unchanged.
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            # NOTE(review): presumably renders the cached pdf into outdir --
            # confirm against gs().
            gs(meta['cached'], outdir)
            pages.append(pdf.pages[0])

            if w is not None:
                for x in pdf.pages[0].items:
                    # binary feature vector: each extracted feature gets 1.0
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        # NOTE(review): border colour for 'author' is 'green'
                        # but the console colour is ``magenta`` -- confirm.
                        x.style['border'] = '2px solid %s' % {
                            'author': 'green',
                            'title': 'blue',
                        }[y]
                        # presumably terminal colour format strings, since
                        # they are applied with ``%``
                        c = {'author': magenta, 'title': blue}[y]
                        print('%s: %s' % (c % y, x.text))

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
Example #4
0
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Steps, in order:

    1. try to load a model from ``weights.pkl~`` (continue without one on
       ``IOError``);
    2. parse ``filename`` with ``pdfminer`` and call ``gs`` on it;
    3. if a model is available, mark and print every item on the first page
       that ``predict`` does not label ``'other'``;
    4. render the first page through ``template`` into ``outfile`` and open
       that file in the web browser.

    Creates an image of the first page (presumably the job of ``gs`` --
    confirm against gs()).

    :param filename: location of the pdf; it is wrapped with ``path`` before
        use.
    """

    try:
        w = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        w = None

    pages = []

    filename = path(filename)

    pdf = pdfminer(filename)

    gs(filename, outdir)
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            # binary feature vector: each extracted feature gets 1.0
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue',
                }[y]
                # presumably terminal colour format strings, since they are
                # applied with ``%``
                c = {'author': magenta, 'title': blue}[y]
                print('%s: %s' % (c % y, x.text))

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...  (with the single page collected here the added offset is 0)
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    # ``open`` is used rather than the Python-2-only ``file`` constructor.
    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    # the file object is closed here, but ``f.name`` remains available
    webbrowser.open(f.name)