コード例 #1
0
def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Outputs data.tsv

    """

    # create file, we'll be appending to it as we go along
    with file(output, 'wb') as f:
        f.write('')

    try:
        w = load('weights.pkl~')
    except IOError:
        print 'failed to load file'
        w = None

    pages = []
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            gs(meta['cached'], outdir)
            pages.append(pdf.pages[0])

            if w is not None:
                for x in pdf.pages[0].items:
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        x.style['border'] = '2px solid %s' % {
                            'author': 'green',
                            'title': 'blue'
                        }[y]
                        c = {'author': magenta, 'title': blue}[y]
                        print '%s: %s' % (c % y, x.text)

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with file(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
コード例 #2
0
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Creates a image of the first page.
    """

    try:
        w = load('weights.pkl~')
    except IOError:
        print 'failed to load file'
        w = None

    pages = []

    filename = path(filename)

    pdf = pdfminer(filename)

    gs(filename, outdir)
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {
                    'author': 'magenta',
                    'title': 'blue'
                }[y]
                c = {'author': magenta, 'title': blue}[y]
                print '%s: %s' % (c % y, x.text)

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with file(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(f.name)
コード例 #3
0
def main(output='data.tsv'):
    """
    Build data set from user annotation.

    Walks every cached PDF, lets find_authors() label the first-page
    lines (appending one TSV row per line to *output*), then renders all
    annotated first pages into a single HTML document and opens it in a
    browser.

    Outputs data.tsv

    NOTE(review): relies on module-level names (outdir, outfile,
    template, magenta, blue, ...) defined outside this chunk.
    """

    # create file, we'll be appending to it as we go along
    # (find_authors reopens it in append mode for every pdf)
    with file(output, 'wb') as f:
        f.write('')

    # previously trained weight vector; if absent we only show the
    # heuristic annotation, no model predictions
    try:
        w = load('weights.pkl~')
    except IOError:
        print 'failed to load file'
        w = None

    pages = []
    # islice(..., None) iterates everything; the bound is a handy knob
    # for limiting the run while debugging
    for meta, d, pdf in islice(data(), None):
        if find_authors(meta, d, pdf, output):
            gs(meta['cached'], outdir)  # presumably rasterizes the cached pdf into outdir -- TODO confirm
            pages.append(pdf.pages[0])

            if w is not None:
                # overlay model predictions on top of the heuristic labels
                for x in pdf.pages[0].items:
                    y = predict(w, {k: 1.0 for k in features(x)})
                    if y != 'other':
                        x.style['border'] = '2px solid %s' % {'author': 'green', 'title': 'blue'}[y]
                        c = {'author': magenta, 'title': blue}[y]  # terminal color formatter for the label
                        print '%s: %s' % (c % y, x.text)

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with file(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(outfile)
コード例 #4
0
def markup_pdf(filename):
    """
    Apply learned model on a pdf.

    Creates a image of the first page, overlays the model's predicted
    labels, renders the page to an HTML document and opens it in a
    browser.

    NOTE(review): relies on module-level names (outdir, outfile,
    template, magenta, blue, ...) defined outside this chunk.
    """

    # previously trained weight vector; if absent we just render the
    # page with no predictions
    try:
        w = load('weights.pkl~')
    except IOError:
        print 'failed to load file'
        w = None

    pages = []

    filename = path(filename)

    pdf = pdfminer(filename)

    gs(filename, outdir)  # presumably rasterizes the pdf into outdir -- TODO confirm
    pages.append(pdf.pages[0])

    if w is not None:
        for x in pdf.pages[0].items:
            y = predict(w, {k: 1.0 for k in features(x)})
            if y != 'other':
                x.style['border'] = '2px solid %s' % {'author': 'magenta', 'title': 'blue'}[y]
                c = {'author': magenta, 'title': blue}[y]  # terminal color formatter for the label
                print '%s: %s' % (c % y, x.text)

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    with file(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    import webbrowser
    webbrowser.open(f.name)
コード例 #5
0
def find_authors(meta, d, pdf, output):

    authors = [set(shingle(x.strip())) for x in meta['author']]
    author = ' ; '.join(meta['author'])

    title = meta['title']
    T = set(shingle(title.strip()))

    if not pdf:
        return

    items = pdf.pages[0].items

    author_candidates = []
    title_candidates = []

    for x in items:
        if 'text' not in x.attributes:
            continue

        text = x.text
        text = re.sub(',', ' ', text)
        text = text.encode('utf8', 'ignore')  # HACK: ignores non-ascii

        b = shingle(text)
        b = set(b)

        if not b:
            continue

        dist = -len(T & b) * 1.0 / len(T | b)

        if dist <= -0.1:
            title_candidates.append(((dist, -x.fontsize), x))

        distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors)

        if distance > -0.2:
            continue

        author_candidates.append(((distance, -x.fontsize), x))

    if not author_candidates or not title_candidates:
        print red % 'Sorry, no lines in the document :-('
        return

    for x in items:
        x.attributes['label'] = 'other'

    for x in heuristic(title, title_candidates):
        x.attributes['label'] = 'title'
        x.style['background-color'] = 'rgba(0,0,255,0.2)'

    for x in heuristic(author, author_candidates):
        x.attributes['label'] = 'author'
        x.style['background-color'] = 'rgba(0,255,0,0.2)'

    # dump training data to file.
    with file(output, 'a') as f:
        for item in items:
            f.write(item.attributes['label'])
            f.write('\t')
            f.write('alwayson')
            f.write('\t')
            f.write('\t'.join(features(item)))
            f.write('\n')

    print

    return True
コード例 #6
0
def find_authors(meta, d, pdf, output):
    """
    Label the first-page lines of pdf as author/title/other.

    Compares each text line's shingle set against the known title and
    author strings from meta (negated Jaccard similarity), highlights
    the best candidates chosen by heuristic(), and appends one TSV
    training row per line to output.

    Returns True on success; None when the pdf is missing or no
    candidate lines were found.  Parameter d is unused in this body --
    presumably kept for the caller's signature; TODO confirm.
    """

    # shingle sets, one per author, plus a display string of all authors
    authors = [set(shingle(x.strip())) for x in meta['author']]
    author = ' ; '.join(meta['author'])

    title = meta['title']
    T = set(shingle(title.strip()))  # title shingle set

    if not pdf:
        return

    items = pdf.pages[0].items

    author_candidates = []
    title_candidates = []

    for x in items:
        # skip non-text items (images, rules, ...)
        if 'text' not in x.attributes:
            continue

        text = x.text
        text = re.sub(',', ' ', text)
        text = text.encode('utf8', 'ignore')  # HACK: ignores non-ascii

        b = shingle(text)
        b = set(b)

        if not b:
            continue

        # negated Jaccard similarity: more negative == more similar
        dist = -len(T & b) * 1.0 / len(T | b)

        # key is (similarity, -fontsize): more similar then larger first
        if dist <= -0.1:
            title_candidates.append(((dist, -x.fontsize), x))

        # summed similarity against every author's shingle set
        distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors)

        if distance > -0.2:
            continue

        author_candidates.append(((distance, -x.fontsize), x))

    if not author_candidates or not title_candidates:
        print red % 'Sorry, no lines in the document :-('
        return

    # default every line to 'other', then promote the winners below
    for x in items:
        x.attributes['label'] = 'other'

    for x in heuristic(title, title_candidates):
        x.attributes['label'] = 'title'
        x.style['background-color'] = 'rgba(0,0,255,0.2)'

    for x in heuristic(author, author_candidates):
        x.attributes['label'] = 'author'
        x.style['background-color'] = 'rgba(0,255,0,0.2)'

    # dump training data to file.
    # row format: label \t alwayson \t feature1 \t feature2 ...
    with file(output, 'a') as f:
        for item in items:
            f.write(item.attributes['label'])
            f.write('\t')
            f.write('alwayson')
            f.write('\t')
            f.write('\t'.join(features(item)))
            f.write('\n')

    print

    return True