コード例 #1
0
ファイル: index.py プロジェクト: timvieira/skid
def update():
    "Update index."

    # Build the index first if it has never been created.
    if not DIRECTORY.exists():
        create()

    # get handle to Whoosh index
    ix = open_dir(DIRECTORY, NAME)

    with ix.writer() as writer, ix.searcher() as searcher:

        # Newest-first ordering lets us bail out at the first file that is
        # already up to date (everything after it is older still).
        docs = sorted((Document(f) for f in CACHE.files()),
                      key=lambda doc: doc.modified,
                      reverse=True)

        for doc in docs:

            # Consult the index before parsing/extracting anything -- the
            # stored mtime tells us whether any work is needed at all.
            hits = searcher.find('cached', str(doc.cached))

            if hits:
                assert len(hits) == 1, 'cached should be unique.'
                hit = hits[0]
                if doc.modified <= hit['mtime']:   # already up to date
                    # Files are sorted newest-first, so all remaining files
                    # are older and therefore also up to date.
                    return
                print('[INFO] update to existing document:', doc.cached)
            else:
                print('[INFO] new document', doc.cached)

            meta = doc.parse_notes()

            # just a lint check
            assert meta['cached'] == doc.cached, \
                'Cached field in notes (%s) ' \
                'does not match associated file (%s) ' \
                'in notes file %r' % (meta['cached'],
                                      doc.cached,
                                      'file://' + doc.d/'notes.org')

            # TODO: consider using two fields: display name and searchable
            # name. to avoid the issues with accents

            writer.update_document(source = meta['source'],
                                   cached = str(doc.cached),
                                   hash = doc.hash(),
                                   title = meta['title'],
                                   author = ' ; '.join(meta['author']),
                                   year = meta['year'],
                                   notes = meta['notes'],
                                   text = doc.text(),
                                   mtime = doc.modified,
                                   added = doc.added,
                                   tags = ' '.join(meta['tags']))
コード例 #2
0
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.

    Yields (meta, Document, pdfminer-text) triples; documents whose text
    extraction raises are silently skipped.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                # Print a banner so each document stands out in the output.
                ff = ' file://' + filename
                print()
                print(red % ('#' + '_' * len(ff)))
                print(red % ('#' + ff))
                print()
                print(('%s: %s' %
                       (yellow % 'meta', meta['title'])).encode('utf8'))
                print(('%s: %s' % (yellow % 'meta', ' ; '.join(
                    meta['author']))).encode('utf8'))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
コード例 #3
0
def update():
    "Update index."

    # create index if it doesn't exist
    if not DIRECTORY.exists():
        create()

    # get handle to Whoosh index
    ix = open_dir(DIRECTORY, NAME)

    with ix.writer() as w, ix.searcher() as searcher:

        # sort cached files by mtime, newest first.
        files = [Document(f) for f in CACHE.files()]
        files.sort(key=(lambda x: x.modified), reverse=True)

        for d in files:

            # lookup document mtime in the index; don't add or extract info if
            # you don't need it.
            result = searcher.find('cached', str(d.cached))

            if not result:
                print('[INFO] new document', d.cached)

            else:
                assert len(result) == 1, 'cached should be unique.'
                result = result[0]
                if d.modified <= result['mtime']:  # already up to date

                    # Since we've sorted files by mtime, we know that files
                    # after this one are older, and thus we're done.
                    return

                print('[INFO] update to existing document:', d.cached)

            meta = d.parse_notes()

            # just a lint check
            assert meta['cached'] == d.cached, \
                'Cached field in notes (%s) ' \
                'does not match associated file (%s) ' \
                'in notes file %r' % (meta['cached'],
                                      d.cached,
                                      'file://' + d.d/'notes.org')

            # TODO: consider using two fields: display name and searchable
            # name. to avoid the issues with accents

            w.update_document(source=meta['source'],
                              cached=str(d.cached),
                              hash=d.hash(),
                              title=meta['title'],
                              author=' ; '.join(meta['author']),
                              year=meta['year'],
                              notes=meta['notes'],
                              text=d.text(),
                              mtime=d.modified,
                              added=d.added,
                              tags=' '.join(meta['tags']))
コード例 #4
0
ファイル: authors.py プロジェクト: timvieira/skid
def data():
    """Build an author index over the cached pdfs.

    Returns (ix, docs): a mapping from simplified author name to the
    documents naming that author, and the list of documents that have
    any author annotation at all.
    """
    by_author = defaultdict(list)
    annotated = []

    for path in CACHE.glob('*.pdf'):
        doc = Document(path)
        doc.meta = doc.parse_notes()
        names = doc.meta['author']
        if not names:
            continue
        annotated.append(doc)
        for name in names:
            by_author[simplify(name)].append(doc)

    return by_author, annotated
コード例 #5
0
def data():
    """Collect documents with author annotations.

    Returns a pair: (mapping of simplified author name -> documents by
    that author, flat list of annotated documents).
    """
    author_index = defaultdict(list)
    annotated_docs = []

    for pdf in CACHE.glob('*.pdf'):
        document = Document(pdf)
        document.meta = document.parse_notes()
        author_list = document.meta['author']
        if author_list:
            annotated_docs.append(document)
            for author in author_list:
                author_index[simplify(author)].append(document)

    return author_index, annotated_docs
コード例 #6
0
ファイル: skid-data.py プロジェクト: pombredanne/skid
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.

    Yields (meta, Document, pdfminer-text) triples. Extraction errors
    propagate to the caller.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                # Print a banner so each document stands out in the output.
                ff = ' file://' + filename
                print()
                print(red % ('#' + '_' * len(ff)))
                print(red % ('#' + ff))
                print()
                print(('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8'))
                print(('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8'))
                print()
            yield (meta, d, pdfminer(filename))
コード例 #7
0
ファイル: skid-data.py プロジェクト: timvieira/skid
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        doc = Document(filename)
        notes = doc.parse_notes()
        # Only documents with at least one annotated author are of interest.
        if not notes['author']:
            continue
        if verbose:
            # Banner makes each document easy to spot in the console output.
            link = ' file://' + filename
            print()
            print(colors.red % ('#' + '_' * len(link)))
            print(colors.red % ('#' + link))
            print()
            title_line = '%s: %s' % (colors.yellow % 'meta', notes['title'])
            author_line = '%s: %s' % (colors.yellow % 'meta',
                                      ' ; '.join(notes['author']))
            print(title_line.encode('utf8'))
            print(author_line.encode('utf8'))
            print()
        try:
            yield (notes, doc, pdfminer(filename))
        except Exception:
            # XXX: silently skips examples which cause pdfminer to throw an
            # exception.
            pass