Beispiel #1
0
def ocr_image(cache, url, codelang):
    """OCR the image at *url* and store the resulting text in *cache*.

    Args:
        cache: cache object; looked up through get_from_cache() and
            written with cache.set(key, text).
        url: address of the image to download and OCR.
        codelang: language code, mapped through ocr.tesseract_languages;
            unknown codes fall back to 'eng'.

    Returns:
        ret_val(0, text) on success (cached or freshly computed),
        ret_val(1, message) when the image could not be downloaded,
        ret_val(2, "ocr failed") when ocr.ocr() returned None.
    """
    # This is also checked in bot_listening but must be redone here, so
    # that if the OCR of the same page is requested several times, the
    # OCR itself is only run once.
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')

    cache_key = image_key(url)

    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key

    image_filename = basename + ".jpg"

    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    try:
        text = ocr.ocr(image_filename, basename, lang)
    finally:
        # Always drop the temporary files, including when the OCR failed
        # or raised; otherwise they pile up in the scratch directory.
        for leftover in (image_filename, basename + ".txt"):
            if os.path.exists(leftover):
                os.remove(leftover)

    if text is None:
        return ret_val(2, "ocr failed")

    cache.set(cache_key, text)

    return ret_val(0, text)
Beispiel #2
0
def ocr_image(cache, url, codelang):
    """OCR the image at *url* and store the resulting text in *cache*.

    Args:
        cache: cache object; looked up through get_from_cache() and
            written with cache.set(key, text).
        url: address of the image to download and OCR.
        codelang: language code, mapped through ocr.tesseract_languages;
            unknown codes fall back to 'eng'.

    Returns:
        ret_val(0, text) on success (cached or freshly computed),
        ret_val(1, message) when the image could not be downloaded,
        ret_val(2, "ocr failed") when ocr.ocr() returned None.
    """
    # This is also checked in bot_listening but must be redone here, so
    # that if the OCR of the same page is requested several times, the
    # OCR itself is only run once.
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')

    cache_key = image_key(url)

    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key

    image_filename = basename + ".jpg"

    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    try:
        text = ocr.ocr(image_filename, basename, lang)
    finally:
        # Always drop the temporary files, including when the OCR failed
        # or raised; otherwise they pile up in the scratch directory.
        for leftover in (image_filename, basename + ".txt"):
            if os.path.exists(leftover):
                os.remove(leftover)

    if text is None:
        return ret_val(2, "ocr failed")

    cache.set(cache_key, text)

    return ret_val(0, text)
Beispiel #3
0
def extract_djvu_text(url, filename, sha1):
    """Download a DjVu file and extract its text layer with ``djvutxt``.

    The file is fetched from *url* into *filename*, passed to
    ``djvutxt --detail=page``, and removed again afterwards.

    Args:
        url: location of the DjVu file.
        filename: local path used for the temporary download; unicode
            paths are encoded to UTF-8 before use.
        sha1: checksum forwarded to utils.copy_file_from_url() and
            returned unchanged to the caller.

    Returns:
        A ``(sha1, data)`` tuple where *data* is a list with one unicode
        string per ``(page ...)`` entry found in the djvutxt output (an
        empty string for an empty ``()`` entry).
    """
    print("extracting text layer")

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')

    utils.copy_file_from_url(url, filename, sha1)

    data = []
    try:
        # GTK app are very touchy
        os.environ['LANG'] = 'en_US.UTF8'
        # FIXME: check return code
        ls = subprocess.Popen(['djvutxt', filename, '--detail=page'],
                              stdout=subprocess.PIPE,
                              close_fds=True)
        # communicate() drains stdout and waits for the process to exit.
        text, _ = ls.communicate()
        for match in re.finditer(
                u'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n', text):
            page = unicode(match.group(1), 'utf-8', 'replace')
            # Strip the "page x y w h" header. Coordinates may be negative
            # (the pattern above accepts them), so allow the sign here too.
            page = re.sub(u'^page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"', u'', page)
            page = re.sub(u'"[ ]*$', u'', page)
            page = unquote_text_from_djvu(page)
            data.append(page)
    finally:
        # Remove the downloaded file even when djvutxt could not be run
        # or the output could not be parsed.
        os.remove(filename)

    return sha1, data
Beispiel #4
0
def extract_djvu_text(url, filename, sha1):
    """Download a DjVu file and extract its text layer with ``djvutxt``.

    The file is fetched from *url* into *filename*, passed to
    ``djvutxt --detail=page``, and removed again afterwards.

    Args:
        url: location of the DjVu file.
        filename: local path used for the temporary download; unicode
            paths are encoded to UTF-8 before use.
        sha1: checksum forwarded to utils.copy_file_from_url() and
            returned unchanged to the caller.

    Returns:
        A ``(sha1, data)`` tuple where *data* is a list with one unicode
        string per ``(page ...)`` entry found in the djvutxt output (an
        empty string for an empty ``()`` entry).
    """
    print("extracting text layer")

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')

    utils.copy_file_from_url(url, filename, sha1)

    data = []
    try:
        # GTK app are very touchy
        os.environ['LANG'] = 'en_US.UTF8'
        # FIXME: check return code
        ls = subprocess.Popen(['djvutxt', filename, '--detail=page'],
                              stdout=subprocess.PIPE,
                              close_fds=True)
        # communicate() drains stdout and waits for the process to exit.
        text, _ = ls.communicate()
        for match in re.finditer(
                u'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n', text):
            page = unicode(match.group(1), 'utf-8', 'replace')
            # Strip the "page x y w h" header. Coordinates may be negative
            # (the pattern above accepts them), so allow the sign here too.
            page = re.sub(u'^page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"', u'', page)
            page = re.sub(u'"[ ]*$', u'', page)
            page = unquote_text_from_djvu(page)
            data.append(page)
    finally:
        # Remove the downloaded file even when djvutxt could not be run
        # or the output could not be parsed.
        os.remove(filename)

    return sha1, data
Beispiel #5
0
def check_and_upload(url, filename, sha1):
    """Make sure *filename* exists locally with the expected *sha1*.

    When the file is missing or its checksum differs, it is fetched
    again from *url* via utils.copy_file_from_url().

    Returns:
        True when the local file already matches or the copy reported
        success, False when the copy returned a falsy value.
    """
    # Fast path: the local copy is already the one we want.
    if os.path.exists(filename) and utils.sha1(filename) == sha1:
        return True

    return bool(utils.copy_file_from_url(url, filename, sha1))
Beispiel #6
0
    if not os.path.exists(out_filename) or ls.returncode:
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename

        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: '  + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt

if __name__ == "__main__":
    # Manual smoke test: download one scanned page, run the OCR on it in
    # hOCR mode, print the result and remove the temporary files.
    lang = 'fr'
    image_filename = 'temp.jpg'
    url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu/page280-1024px-Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu.jpg'

    utils.copy_file_from_url(url, image_filename)
    hocr_output = ocr(image_filename, image_filename,
                      tesseract_languages[lang], config='hocr')
    print(hocr_output)

    # The image name doubles as the output base name, hence "<image>.txt".
    for leftover in (image_filename, image_filename + ".txt"):
        os.remove(leftover)
Beispiel #7
0
def copy_ia_file(ia_id, metadata):
    """Fetch one file of the archive.org item *ia_id* into the working dir.

    Args:
        ia_id: item identifier, interpolated into the archive.org
            download URL.
        metadata: mapping with at least 'name' (the remote file name,
            also used as the local file name) and 'sha1' (passed as the
            expected checksum).
    """
    download_root = 'https://archive.org/download/%s/' % ia_id
    name = metadata['name']

    utils.copy_file_from_url(download_root + name, name,
                             expect_sha1=metadata['sha1'])
Beispiel #8
0
def check_and_upload(url, filename, sha1):
    """Ensure a local copy of *url* with checksum *sha1* is at *filename*.

    The download through utils.copy_file_from_url() only happens when
    the file is absent or its checksum does not match.

    Returns:
        False when a download was needed and the copy returned a falsy
        value, True otherwise.
    """
    needs_download = (not os.path.exists(filename)
                      or utils.sha1(filename) != sha1)
    if needs_download and not utils.copy_file_from_url(url, filename, sha1):
        return False
    return True