def update(): "Update index." # create index if it doesn't exist if not DIRECTORY.exists(): create() # get handle to Whoosh index ix = open_dir(DIRECTORY, NAME) with ix.writer() as w, ix.searcher() as searcher: # sort cached files by mtime. files = [Document(f) for f in CACHE.files()] files.sort(key = (lambda x: x.modified), reverse=True) for d in files: # lookup document mtime in the index; don't add or extract info if # you don't need it. result = searcher.find('cached', str(d.cached)) if not result: print('[INFO] new document', d.cached) else: assert len(result) == 1, 'cached should be unique.' result = result[0] if d.modified <= result['mtime']: # already up to date # Since we've sorted files by mtime, we know that files # after this one are older, and thus we're done. return print('[INFO] update to existing document:', d.cached) meta = d.parse_notes() # just a lint check assert meta['cached'] == d.cached, \ 'Cached field in notes (%s) ' \ 'does not match associated file (%s) ' \ 'in notes file %r' % (meta['cached'], d.cached, 'file://' + d.d/'notes.org') # TODO: consider using two fields: display name and searchable # name. to avoid the issues with accents w.update_document(source = meta['source'], cached = str(d.cached), hash = d.hash(), title = meta['title'], author = ' ; '.join(meta['author']), year = meta['year'], notes = meta['notes'], text = d.text(), mtime = d.modified, added = d.added, tags = ' '.join(meta['tags']))
def data(verbose=True):
    """Yield (meta, document, text) for cached skid PDFs with annotated authors.

    Iterates the PDF cache and, for each document whose notes annotate at
    least one author, yields the parsed metadata dict, the ``Document``, and
    the pdfminer text extraction. Documents that make pdfminer raise are
    skipped silently. When ``verbose``, prints a banner plus title and
    authors for each matching document.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(red % ('#' + '_' * len(ff)))
                print(red % ('#' + ff))
                print()
                # Python 3 strings are unicode already; the old
                # ``.encode('utf8')`` from the py2 version is unnecessary
                # (and under py3 would print a bytes repr).
                print('%s: %s' % (yellow % 'meta', meta['title']))
                print('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author'])))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
def update():
    """Update the Whoosh full-text index from the skid cache.

    Creates the index directory on first use, then walks cached files in
    reverse-mtime order: new documents are added, stale ones refreshed.
    Stops at the first document that is already up to date — everything
    after it in the sort order is older, hence also up to date.
    """
    # create index if it doesn't exist
    if not DIRECTORY.exists():
        create()

    # get handle to Whoosh index
    ix = open_dir(DIRECTORY, NAME)

    with ix.writer() as w, ix.searcher() as searcher:

        # sort cached files by mtime.
        files = [Document(f) for f in CACHE.files()]
        files.sort(key=(lambda x: x.modified), reverse=True)

        for d in files:
            # lookup document mtime in the index; don't add or extract info if
            # you don't need it.
            # NOTE: ``unicode`` (py2) replaced by ``str`` — in Python 3 all
            # strings are unicode and ``unicode`` is undefined.
            result = searcher.find('cached', str(d.cached))

            if not result:
                print('[INFO] new document', d.cached)
            else:
                assert len(result) == 1, 'cached should be unique.'
                result = result[0]
                if d.modified <= result['mtime']:   # already up to date
                    # Since we've sorted files by mtime, we know that files
                    # after this one are older, and thus we're done.
                    return
                print('[INFO] update to existing document:', d.cached)

            meta = d.parse_notes()

            # just a lint check: the notes file must point back at the
            # cached file it annotates.
            assert meta['cached'] == d.cached, \
                'Cached field in notes (%s) ' \
                'does not match associated file (%s) ' \
                'in notes file %r' % (meta['cached'], d.cached,
                                      'file://' + d.d/'notes.org')

            # TODO: consider using two fields: display name and searchable
            # name. to avoid the issues with accents
            w.update_document(source=meta['source'],
                              cached=str(d.cached),
                              hash=d.hash(),
                              title=meta['title'],
                              author=' ; '.join(meta['author']),
                              year=meta['year'],
                              notes=meta['notes'],
                              text=d.text(),
                              mtime=d.modified,
                              added=d.added,
                              tags=' '.join(meta['tags']))
def data():
    """Build an author index over skid's cached PDFs.

    Returns a pair ``(ix, docs)``: ``ix`` maps each simplified author name
    to the list of documents annotated with that author; ``docs`` lists
    every document that has at least one author annotated.
    """
    by_author = defaultdict(list)
    annotated = []            # documents with authors annotated
    for pdf in CACHE.glob('*.pdf'):
        doc = Document(pdf)
        doc.meta = doc.parse_notes()
        names = doc.meta['author']
        if not names:
            continue
        annotated.append(doc)
        for name in names:
            by_author[simplify(name)].append(doc)
    return by_author, annotated
def data(verbose=True):
    """Yield (meta, document, text) for cached skid PDFs with annotated authors.

    Iterates the PDF cache and, for each document whose notes annotate at
    least one author, yields the parsed metadata dict, the ``Document``, and
    the pdfminer text extraction. Unlike the guarded variant, pdfminer
    exceptions propagate to the caller here. When ``verbose``, prints a
    banner plus title and authors for each matching document.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(red % ('#' + '_' * len(ff)))
                print(red % ('#' + ff))
                print()
                # Python 3 strings are unicode already; the old
                # ``.encode('utf8')`` from the py2 version is unnecessary
                # (and under py3 would print a bytes repr).
                print('%s: %s' % (yellow % 'meta', meta['title']))
                print('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author'])))
                print()
            yield (meta, d, pdfminer(filename))
def data(verbose=True):
    """Yield (meta, document, text) for cached skid PDFs with annotated authors.

    Iterates the PDF cache and, for each document whose notes annotate at
    least one author, yields the parsed metadata dict, the ``Document``, and
    the pdfminer text extraction. Documents that make pdfminer raise are
    skipped silently. When ``verbose``, prints a banner plus title and
    authors for each matching document.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(colors.red % ('#' + '_' * len(ff)))
                print(colors.red % ('#' + ff))
                print()
                # Fix: ``print((...).encode('utf8'))`` was a py2 leftover —
                # under Python 3 it printed the bytes repr (``b'...'``)
                # instead of the text. print() handles unicode directly.
                print('%s: %s' % (colors.yellow % 'meta', meta['title']))
                print('%s: %s' % (colors.yellow % 'meta',
                                  ' ; '.join(meta['author'])))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass