Example #1
0
def ocr_image(cache, url, codelang):
    """Run OCR on the image at ``url`` and store the text in ``cache``.

    cache    -- cache object; read through get_from_cache() and written
                with cache.set().
    url      -- address of the image to download (encoded to UTF-8 here).
    codelang -- language code looked up in ocr.tesseract_languages;
                unknown codes fall back to 'eng'.

    The result is built by ret_val() with one of these codes:
      0 -- success, payload is the recognised text (cached or fresh)
      1 -- the image could not be downloaded
      2 -- ocr.ocr() returned None
    """
    # This is checked in bot_listening but must be redone here, so if
    # the ocr for the same page is asked multiple times, we will do the ocr
    # only once.
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')
    cache_key = image_key(url)
    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key
    image_filename = basename + ".jpg"

    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    try:
        text = ocr.ocr(image_filename, basename, lang)
    finally:
        # Remove the temporary files whether the OCR succeeded or not, so
        # that failed runs do not pile up images in the temp directory.
        for leftover in (image_filename, basename + ".txt"):
            if os.path.exists(leftover):
                os.remove(leftover)

    if text is None:
        return ret_val(2, "ocr failed")

    cache.set(cache_key, text)

    return ret_val(0, text)
Example #2
0
def extract_djvu_text(url, filename, sha1):
    """Download a djvu file and extract its text layer page by page.

    The file at ``url`` is fetched to ``filename`` with
    utils.copy_file_from_url(), dumped with ``djvutxt --detail=page`` and
    removed again before returning.

    Returns a ``(sha1, data)`` tuple: ``sha1`` is the argument passed back
    unchanged and ``data`` is a list with one entry per ``(page ...)`` record
    found in the djvutxt output, each run through unquote_text_from_djvu().
    """
    print("extracting text layer")

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')

    utils.copy_file_from_url(url, filename, sha1)

    data = []
    try:
        # GTK app are very touchy
        os.environ['LANG'] = 'en_US.UTF8'
        # FIXME: check return code
        ls = subprocess.Popen(['djvutxt', filename, '--detail=page'],
                              stdout=subprocess.PIPE, close_fds=True)
        text = ls.communicate()[0]

        page_re = r'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n'
        for match in re.finditer(page_re, text):
            page_text = unicode(match.group(1), 'utf-8', 'replace')
            # Strip the 'page x y w h "' header. The coordinates may be
            # negative, exactly as accepted by page_re above; otherwise the
            # header would be left in the extracted text.
            page_text = re.sub(r'^page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"',
                               u'', page_text)
            page_text = re.sub(r'"[ ]*$', u'', page_text)
            data.append(unquote_text_from_djvu(page_text))
    finally:
        # Always drop the downloaded copy, even if the extraction raised.
        os.remove(filename)

    return sha1, data
def ocr_image(cache, url, codelang):
    """Run OCR on the image at ``url`` and store the text in ``cache``.

    cache    -- cache object; read through get_from_cache() and written
                with cache.set().
    url      -- address of the image to download (encoded to UTF-8 here).
    codelang -- language code looked up in ocr.tesseract_languages;
                unknown codes fall back to 'eng'.

    The result is built by ret_val() with one of these codes:
      0 -- success, payload is the recognised text (cached or fresh)
      1 -- the image could not be downloaded
      2 -- ocr.ocr() returned None
    """
    # This is checked in bot_listening but must be redone here, so if
    # the ocr for the same page is asked multiple times, we will do the ocr
    # only once.
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')
    cache_key = image_key(url)
    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key
    image_filename = basename + ".jpg"

    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    try:
        text = ocr.ocr(image_filename, basename, lang)
    finally:
        # Remove the temporary files whether the OCR succeeded or not, so
        # that failed runs do not pile up images in the temp directory.
        for leftover in (image_filename, basename + ".txt"):
            if os.path.exists(leftover):
                os.remove(leftover)

    if text is None:
        return ret_val(2, "ocr failed")

    cache.set(cache_key, text)

    return ret_val(0, text)
Example #4
0
        fd.write('An error occurred during ocr processing: ' + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt


if __name__ == "__main__":
    # Manual smoke test: download one sample page image, run ocr() on it
    # with the 'hocr' config, print what it returns, then delete the
    # temporary files.
    import os
    lang = 'fr'
    image_filename = 'temp.jpg'
    url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu/page280-1024px-Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu.jpg'

    utils.copy_file_from_url(url, image_filename)
    # The image name doubles as the output base name.
    result = ocr(image_filename, image_filename,
                 tesseract_languages[lang], config='hocr')
    print(result)

    for leftover in (image_filename, image_filename + ".txt"):
        os.remove(leftover)
Example #5
0
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename

        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: '  + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename
        return None

    return txt

if __name__ == "__main__":
    # Manual smoke test: fetch a sample page image, feed it to ocr() using
    # the 'hocr' config, print the returned value and clean up afterwards.
    import os
    lang = 'fr'
    image_filename = 'temp.jpg'
    url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu/page280-1024px-Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu.jpg'

    utils.copy_file_from_url(url, image_filename)
    # The image name is also passed as the output base name.
    result = ocr(image_filename, image_filename,
                 tesseract_languages[lang], config='hocr')
    print(result)

    for leftover in (image_filename, image_filename + ".txt"):
        os.remove(leftover)
Example #6
0
def copy_file(lang, family, filename, dest):
    """Copy the wiki file ``filename`` of site (lang, family) to ``dest``.

    ``filename`` is a UTF-8 encoded byte string. The file page is resolved
    with get_filepage() and its URL is handed to utils.copy_file_from_url()
    together with the SHA1 sum reported by the page.
    """
    wiki = pywikibot.getSite(lang, family)
    file_page = get_filepage(wiki, unicode(filename, 'utf-8'))
    utils.copy_file_from_url(file_page.fileUrl(), dest,
                             file_page.getFileSHA1Sum())
Example #7
0
def check_and_upload(url, filename, sha1):
    """Make sure ``filename`` exists locally with the given ``sha1``.

    If the file is present and utils.sha1() of it equals ``sha1``, nothing
    is fetched. Otherwise the file is fetched from ``url`` with
    utils.copy_file_from_url().

    Returns True when the file is already up to date or the copy reported
    success, False when the copy reported failure.
    """
    up_to_date = os.path.exists(filename) and utils.sha1(filename) == sha1
    if up_to_date:
        return True

    return bool(utils.copy_file_from_url(url, filename, sha1))
Example #8
0
def check_and_upload(url, filename, sha1):
    """Make sure ``filename`` exists locally with the given ``sha1``.

    If the file is present and utils.sha1() of it equals ``sha1``, nothing
    is fetched. Otherwise the file is fetched from ``url`` with
    utils.copy_file_from_url().

    Returns True when the file is already up to date or the copy reported
    success, False when the copy reported failure.
    """
    up_to_date = os.path.exists(filename) and utils.sha1(filename) == sha1
    if up_to_date:
        return True

    return bool(utils.copy_file_from_url(url, filename, sha1))
        os.environ['GSDJVU'] = gsdjvu

    out_file = in_file[:-3] + 'djvu'

    djvudigital = djvulibre_path + 'djvudigital'
    # --words option is useless as many pdf contains text layer only for
    # the first page
    ls = subprocess.Popen([ djvudigital, "--dpi=300", in_file, out_file], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = ls.stdout.read()
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "djvudigital fail: ", ls.returncode, in_file
        out_file = None

    if gsdjvu:
        del os.environ['GSDJVU']

    return out_file


if __name__ == "__main__":
    # Manual check: download a sample PDF from Wikimedia Commons into ~/tmp,
    # convert it with pdf_to_djvu() and delete the downloaded PDF.
    import utils
    pdf_url = 'https://upload.wikimedia.org/wikipedia/commons/8/81/Accord_compl%C3%A9mentaire_relatif_%C3%A0_la_Malaisie_le_11_Septembre_1963.pdf'
    tmp_dir = os.path.expanduser('~/tmp/')
    local_pdf = tmp_dir + 'Accord complémentaire relatif à la Malaisie le 11 Septembre 1963.pdf'

    utils.copy_file_from_url(pdf_url, local_pdf)
    djvu_name = pdf_to_djvu(local_pdf)
    os.remove(local_pdf)
    # The generated djvu file is not removed here:
    #os.remove(djvu_name)
Example #10
0
def copy_file(lang, family, filename, dest):
    """Copy the wiki file ``filename`` of site (lang, family) to ``dest``.

    ``filename`` is a UTF-8 encoded byte string. The file page is resolved
    with get_filepage() and its URL is handed to utils.copy_file_from_url()
    together with the SHA1 sum reported by the page.
    """
    wiki = pywikibot.getSite(lang, family)
    file_page = get_filepage(wiki, unicode(filename, 'utf-8'))
    utils.copy_file_from_url(file_page.fileUrl(), dest,
                             file_page.getFileSHA1Sum())