Example #1
0
def heuristic(target, candidates):
    """
    Applies string overlap with ``target`` heuristic to ``candidates``.

    Used to winnow the collection of candidates.
    """

    extracted = []
    candidates.sort()

    # font name and size of top hit
    fontname = candidates[0][1].fontname
    fontsize = candidates[0][1].fontsize

    print
    print 'Candidates:'
    for (distance, _), item in candidates[:10]:  # most similar lines
        x = item.attributes
        print '  %6.4f' % distance,

        text = item.text.encode('utf8')
        info = x.copy()
        info.pop('text')

        if item.fontname == fontname and item.fontsize == fontsize:
            print green % text, info
            extracted.append(item)
        else:
            print red % text, info

    if not extracted:
        print red % 'failed to extract anything relevant :-('
        return

    extracted_text = ' '.join(x.text for x in extracted).encode('utf8')

    size = 3
    c = set(shingle(extracted_text, n=size))

    tally = [0]*len(target)
    for i in xrange(len(target)):
        if target[i:i+size] in c:
            for j in xrange(i, i+size):
                tally[j] += 1

    print ''.join(color(c, 1 - x*1.0/3) for c, x in zip(target, tally))

    return extracted
Example #2
0
def heuristic(target, candidates):
    """
    Applies string overlap with ``target`` heuristic to ``candidates``.

    Used to winnow the collection of candidates.
    """

    extracted = []
    candidates.sort()

    # font name and size of top hit
    fontname = candidates[0][1].fontname
    fontsize = candidates[0][1].fontsize

    print
    print 'Candidates:'
    for (distance, _), item in candidates[:10]:  # most similar lines
        x = item.attributes
        print '  %6.4f' % distance,

        text = item.text.encode('utf8')
        info = x.copy()
        info.pop('text')

        if item.fontname == fontname and item.fontsize == fontsize:
            print green % text, info
            extracted.append(item)
        else:
            print red % text, info

    if not extracted:
        print red % 'failed to extract anything relevant :-('
        return

    extracted_text = ' '.join(x.text for x in extracted).encode('utf8')

    size = 3
    c = set(shingle(extracted_text, n=size))

    tally = [0] * len(target)
    for i in xrange(len(target)):
        if target[i:i + size] in c:
            for j in xrange(i, i + size):
                tally[j] += 1

    print ''.join(color(c, 1 - x * 1.0 / 3) for c, x in zip(target, tally))

    return extracted
Example #3
0
def find_authors(meta, d, pdf, output):

    authors = [set(shingle(x.strip())) for x in meta['author']]
    author = ' ; '.join(meta['author'])

    title = meta['title']
    T = set(shingle(title.strip()))

    if not pdf:
        return

    items = pdf.pages[0].items

    author_candidates = []
    title_candidates = []

    for x in items:
        if 'text' not in x.attributes:
            continue

        text = x.text
        text = re.sub(',', ' ', text)
        text = text.encode('utf8', 'ignore')  # HACK: ignores non-ascii

        b = shingle(text)
        b = set(b)

        if not b:
            continue

        dist = -len(T & b) * 1.0 / len(T | b)

        if dist <= -0.1:
            title_candidates.append(((dist, -x.fontsize), x))

        distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors)

        if distance > -0.2:
            continue

        author_candidates.append(((distance, -x.fontsize), x))

    if not author_candidates or not title_candidates:
        print red % 'Sorry, no lines in the document :-('
        return

    for x in items:
        x.attributes['label'] = 'other'

    for x in heuristic(title, title_candidates):
        x.attributes['label'] = 'title'
        x.style['background-color'] = 'rgba(0,0,255,0.2)'

    for x in heuristic(author, author_candidates):
        x.attributes['label'] = 'author'
        x.style['background-color'] = 'rgba(0,255,0,0.2)'

    # dump training data to file.
    with file(output, 'a') as f:
        for item in items:
            f.write(item.attributes['label'])
            f.write('\t')
            f.write('alwayson')
            f.write('\t')
            f.write('\t'.join(features(item)))
            f.write('\n')

    print

    return True
Example #4
0
def find_authors(meta, d, pdf, output):

    authors = [set(shingle(x.strip())) for x in meta['author']]
    author = ' ; '.join(meta['author'])

    title = meta['title']
    T = set(shingle(title.strip()))

    if not pdf:
        return

    items = pdf.pages[0].items

    author_candidates = []
    title_candidates = []

    for x in items:
        if 'text' not in x.attributes:
            continue

        text = x.text
        text = re.sub(',', ' ', text)
        text = text.encode('utf8', 'ignore')  # HACK: ignores non-ascii

        b = shingle(text)
        b = set(b)

        if not b:
            continue

        dist = -len(T & b) * 1.0 / len(T | b)

        if dist <= -0.1:
            title_candidates.append(((dist, -x.fontsize), x))

        distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors)

        if distance > -0.2:
            continue

        author_candidates.append(((distance, -x.fontsize), x))

    if not author_candidates or not title_candidates:
        print red % 'Sorry, no lines in the document :-('
        return

    for x in items:
        x.attributes['label'] = 'other'

    for x in heuristic(title, title_candidates):
        x.attributes['label'] = 'title'
        x.style['background-color'] = 'rgba(0,0,255,0.2)'

    for x in heuristic(author, author_candidates):
        x.attributes['label'] = 'author'
        x.style['background-color'] = 'rgba(0,255,0,0.2)'

    # dump training data to file.
    with file(output, 'a') as f:
        for item in items:
            f.write(item.attributes['label'])
            f.write('\t')
            f.write('alwayson')
            f.write('\t')
            f.write('\t'.join(features(item)))
            f.write('\n')

    print

    return True