Example #1
0
def page_index(request):
    db = app.get_feature('document_storage').default_db
    pages = Page.objects(db).order_by('date_time')
    for p in pages:
        print p.pk, p, p.summary, p.image
    return {'pages': pages,
            'thumbnail': _get_thumbnail}
Example #2
0
def add_pages(args):
    """Import image files as pages into the default document storage.

    This is a generator: it yields human-readable progress messages while
    it works.

    For every path in ``args.paths`` the file is fingerprinted, skipped if
    a page with the same fingerprint is already stored, optionally run
    through OCR, re-saved in an acceptable image format and stored as a
    new ``Page``.

    :param args: parsed command arguments. The attributes read here are
        ``paths``, ``language``, ``summary``, ``summary_prefix``,
        ``no_ocr``, ``skip_ocr_errors`` and ``format``.
    :raises CommandError: if one of the paths does not exist, or if OCR
        fails and ``args.skip_ocr_errors`` is not set.
    """
    _args_to_unicode(args, ['language', 'summary', 'summary_prefix'])
    db = app.get_feature('document_storage').default_db

    # Check that all files exist before importing anything.  An explicit
    # exception is raised instead of using ``assert`` because assertions
    # are stripped when Python runs with -O.
    for path in args.paths:
        if not os.path.exists(path):
            raise CommandError('File not found: {0}'.format(path))

    # import
    for path in args.paths:
        yield '* importing {0} (language {1})'.format(path, args.language)

        # close the source file as soon as it has been hashed
        with open(path, 'rb') as source_file:
            fingerprint = get_file_hash(source_file)

        # check file hash uniqueness
        if Page.objects(db).where(source_fingerprint=fingerprint):
            yield '...already in the database.'
            continue

        p = Page()

        p.summary = args.summary or get_summary_from_path(path)
        if args.summary_prefix:
            p.summary = u'{0} {1}'.format(args.summary_prefix, p.summary)
        p.language = args.language or None
        p.source_fingerprint = fingerprint
        if not args.no_ocr:
            try:
                p.details = image_to_text(path=path, language=p.language)
            except RuntimeError as e:
                if not args.skip_ocr_errors:
                    raise CommandError(e)
                yield '(OCR failed, saving only image itself)'

        # usually we don't need heavy formats like ppm or tiff even for OCR
        img = Image.open(path)
        if args.format:
            fmt = args.format
        elif img.format not in IMAGE_FORMATS:
            fmt = IMAGE_FORMATS[0]
        else:
            fmt = img.format
        img.save(TMP_FILENAME, fmt)

        # Keep the temporary file open only while the page is being saved;
        # the same TMP_FILENAME is rewritten on the next iteration.
        # NOTE(review): assumes p.save(db) consumes the image data before
        # returning -- confirm against the storage backend.
        with open(TMP_FILENAME, 'rb') as image_file:
            p['image'] = image_file
            # provide original path so that the resulting filename is
            # inherited
            p['image'].path = path

            p.save(db)