def heuristic(target, candidates): """ Applies string overlap with ``target`` heuristic to ``candidates``. Used to winnow the collection of candidates. """ extracted = [] candidates.sort() # font name and size of top hit fontname = candidates[0][1].fontname fontsize = candidates[0][1].fontsize print print 'Candidates:' for (distance, _), item in candidates[:10]: # most similar lines x = item.attributes print ' %6.4f' % distance, text = item.text.encode('utf8') info = x.copy() info.pop('text') if item.fontname == fontname and item.fontsize == fontsize: print green % text, info extracted.append(item) else: print red % text, info if not extracted: print red % 'failed to extract anything relevant :-(' return extracted_text = ' '.join(x.text for x in extracted).encode('utf8') size = 3 c = set(shingle(extracted_text, n=size)) tally = [0]*len(target) for i in xrange(len(target)): if target[i:i+size] in c: for j in xrange(i, i+size): tally[j] += 1 print ''.join(color(c, 1 - x*1.0/3) for c, x in zip(target, tally)) return extracted
def heuristic(target, candidates): """ Applies string overlap with ``target`` heuristic to ``candidates``. Used to winnow the collection of candidates. """ extracted = [] candidates.sort() # font name and size of top hit fontname = candidates[0][1].fontname fontsize = candidates[0][1].fontsize print print 'Candidates:' for (distance, _), item in candidates[:10]: # most similar lines x = item.attributes print ' %6.4f' % distance, text = item.text.encode('utf8') info = x.copy() info.pop('text') if item.fontname == fontname and item.fontsize == fontsize: print green % text, info extracted.append(item) else: print red % text, info if not extracted: print red % 'failed to extract anything relevant :-(' return extracted_text = ' '.join(x.text for x in extracted).encode('utf8') size = 3 c = set(shingle(extracted_text, n=size)) tally = [0] * len(target) for i in xrange(len(target)): if target[i:i + size] in c: for j in xrange(i, i + size): tally[j] += 1 print ''.join(color(c, 1 - x * 1.0 / 3) for c, x in zip(target, tally)) return extracted
def find_authors(meta, d, pdf, output): authors = [set(shingle(x.strip())) for x in meta['author']] author = ' ; '.join(meta['author']) title = meta['title'] T = set(shingle(title.strip())) if not pdf: return items = pdf.pages[0].items author_candidates = [] title_candidates = [] for x in items: if 'text' not in x.attributes: continue text = x.text text = re.sub(',', ' ', text) text = text.encode('utf8', 'ignore') # HACK: ignores non-ascii b = shingle(text) b = set(b) if not b: continue dist = -len(T & b) * 1.0 / len(T | b) if dist <= -0.1: title_candidates.append(((dist, -x.fontsize), x)) distance = sum(-len(a & b) * 1.0 / len(a | b) for a in authors) if distance > -0.2: continue author_candidates.append(((distance, -x.fontsize), x)) if not author_candidates or not title_candidates: print red % 'Sorry, no lines in the document :-(' return for x in items: x.attributes['label'] = 'other' for x in heuristic(title, title_candidates): x.attributes['label'] = 'title' x.style['background-color'] = 'rgba(0,0,255,0.2)' for x in heuristic(author, author_candidates): x.attributes['label'] = 'author' x.style['background-color'] = 'rgba(0,255,0,0.2)' # dump training data to file. with file(output, 'a') as f: for item in items: f.write(item.attributes['label']) f.write('\t') f.write('alwayson') f.write('\t') f.write('\t'.join(features(item))) f.write('\n') print return True