Example #1
def handle_query(params, start_response):
    print >> sys.stderr, params

    if params['lang'] and params['book']:
        try:
            ret_code = '200 OK'
            result = hocr.get_hocr(params['lang'], params['book'])
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = { 'error' : 1, 'text' : ret_code }
    else:
        ret_code = '400 Bad Request'
        result = { 'error' : 1, 'text' : ret_code }

    try:
        text = json.dumps(result)
    except UnicodeDecodeError:
        print >> sys.stderr, result
        ret_code = '400 Bad Request'
        text = json.dumps({ 'error' : 1, 'text' : ret_code })

    start_response(ret_code, [('Content-Type',
                               'application/json' + '; charset=UTF-8'),
                              ('Content-Length', str(len(text))),
                              ('Access-Control-Allow-Origin', '*')])
    return [ text ]
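
As a quick check of the handler above, here is a minimal sketch that drives handle_query without a real WSGI server; the fake_start_response callback and the params values are hypothetical stand-ins for what the hosting application would supply.

def fake_start_response(status, headers):
    # Print what a WSGI server would send on the wire (sketch only).
    print status
    for name, value in headers:
        print '%s: %s' % (name, value)

# 'lang' and 'book' values are made up for illustration.
body = handle_query({'lang': 'en', 'book': 'Example.djvu'},
                    fake_start_response)
print ''.join(body)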
Example #2
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width*height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1", filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([ ddjvu, "-format=tiff", "-page=%d" % page_nr, "-subsample=%d" % subsample, filename, tiff_name], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
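
The subsample loop above searches for the smallest factor that keeps width*height at or below 50 * 2**20 pixels, then caps it at 12 (the largest subsampling factor ddjvu accepts). The same result can be computed directly; a sketch, not taken from the source:

def pick_subsample(width, height):
    # The smallest s with (width * height) // s <= max_pixels is
    # (width * height) // (max_pixels + 1) + 1; then clamp to 1..12.
    max_pixels = (1 << 20) * 50
    s = width * height // (max_pixels + 1) + 1
    return min(s, 12)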
Example #3
def handle_query(params, start_response):
    log(params)

    if params['lang'] and params['book']:
        try:
            ret_code = '200 OK'
            result = hocr.get_hocr(params['lang'], params['book'])
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    try:
        text = json.dumps(result)
    except UnicodeDecodeError:
        log(result)
        ret_code = '400 Bad Request'
        text = json.dumps({'error': 1, 'text': ret_code})

    start_response(ret_code,
                   [('Content-Type', 'application/json' + '; charset=UTF-8'),
                    ('Content-Length', str(len(text))),
                    ('Access-Control-Allow-Origin', '*')])
    return [text]
Example #4
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)

        subsample = 1
        while (width * height) / subsample > (1 << 20) * 50:
            subsample += 1

        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1",
                              filename)
        subsample = 1

    if subsample != 1:
        print "subsample", subsample

    tiff_name = opt.temp_tiff_dir + '/page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([
        ddjvu, "-format=tiff",
        "-page=%d" % page_nr,
        "-subsample=%d" % subsample, filename, tiff_name
    ],
                          stdout=subprocess.PIPE,
                          preexec_fn=setrlimits,
                          close_fds=True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
Example #5
def parse(opt, filename):
    try:
        ret_code = do_parse(opt, filename)
    except Exception:
        utils.print_traceback(filename)
        ret_code = -1

    return ret_code
Example #6
def do_file(job_queue, opt, filename):
    while True:
        page_nr = job_queue.get()
        if page_nr == None:
            print "Stopping thread"
            return
        try:
            do_one_page(opt, page_nr, filename)
        except Exception:
            utils.print_traceback(filename)
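
do_file is a worker loop that drains a shared queue until it receives a None sentinel. A sketch of how such workers are typically wired up (the thread count and nr_pages are hypothetical; opt and filename come from the surrounding program):

import threading
import Queue  # Python 2 module name, matching the snippets

job_queue = Queue.Queue()
workers = [threading.Thread(target=do_file, args=(job_queue, opt, filename))
           for _ in range(4)]
for w in workers:
    w.start()
for page_nr in range(1, nr_pages + 1):
    job_queue.put(page_nr)
for w in workers:
    job_queue.put(None)  # one sentinel per worker so each loop exits
for w in workers:
    w.join()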
Example #7
def handle_suggest_query(params, start_response):
    if params['lang'] and params['title']:
        try:
            modernize = modernization.Modernization(params['lang'])
            result = modernize.suggest_dict(params['title'])
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = { 'error' : 1, 'text' : ret_code }
    else:
        ret_code = '400 Bad Request'
        result = { 'error' : 1, 'text' : ret_code }

    return return_response(start_response, result, True, ret_code, 'application/json')
Example #8
def handle_suggest_query(params, start_response):
    if params['lang'] and params['title']:
        try:
            modernize = modernization.Modernization(params['lang'])
            result = modernize.suggest_dict(params['title'])
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    return return_response(start_response, result, True, ret_code,
                           'application/json')
Example #9
def html_for_queue(queue):
    html = u''
    for i in queue:
        mtitle = i[0]
        codelang = i[1]
        try:
            msite = pywikibot.getSite(codelang, 'wikisource')
            page = pywikibot.Page(msite, mtitle)
            path = msite.nice_get_address(page.title(asUrl = True))
            url = '%s://%s%s' % (msite.protocol(), msite.hostname(), path)
        except BaseException:
            utils.print_traceback()
            url = ""

        html += date_s(i[3])+' '+i[2]+" "+i[1]+" <a href=\""+url+"\">"+i[0]+"</a><br/>"
    return html
Example #10
def handle_blacklist_query(params, start_response):
    if params['lang'] and params['blacklist']:
        try:
            modernize = modernization.Modernization(params['lang'])
            blacklist = json.loads(params['blacklist'])
            modernize.save_blacklist(blacklist)
            ret_code = '200 OK'
            result = { 'error' : 0, 'text' :'OK' }
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = { 'error' : 1, 'text' : ret_code }
    else:
        ret_code = '400 Bad Request'
        result = { 'error' : 1, 'text' : ret_code }

    return return_response(start_response, result, True, ret_code, 'application/json')
Example #11
def handle_blacklist_query(params, start_response):
    if params['lang'] and params['blacklist']:
        try:
            modernize = modernization.Modernization(params['lang'])
            blacklist = json.loads(params['blacklist'])
            modernize.save_blacklist(blacklist)
            ret_code = '200 OK'
            result = {'error': 0, 'text': 'OK'}
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    return return_response(start_response, result, True, ret_code,
                           'application/json')
Example #12
def get_djvu(cache, mysite, djvuname, check_timestamp=False):

    print "get_djvu", repr(djvuname)

    djvuname = djvuname.replace(" ", "_")
    cache_filename = djvuname + '.dat'

    obj = cache.get(cache_filename)
    if not obj:
        print "CACHE MISS"
        filepage = copy_File.get_filepage(mysite, djvuname)
        if not filepage:
            # can occur if File: has been deleted
            return None
        try:
            url = filepage.fileUrl()
            obj = extract_djvu_text(url, djvuname, filepage.getFileSHA1Sum())
        except:
            utils.print_traceback("extract_djvu_text() fail")
            obj = None
        if obj:
            cache.set(cache_filename, obj)
        else:
            return None
    else:
        if check_timestamp:
            filepage = copy_File.get_filepage(mysite, djvuname)
            if not filepage:
                # can occur if File: has been deleted
                return None
            sha1 = filepage.getFileSHA1Sum()
            if sha1 != obj[0]:
                print "OUTDATED FILE"
                url = filepage.fileUrl()
                try:
                    obj = extract_djvu_text(url, djvuname, sha1)
                    cache.set(cache_filename, obj)
                except:
                    return None

    return obj[1]
Example #13
def get_djvu(cache, mysite, djvuname, check_timestamp = False):

    print "get_djvu", repr(djvuname)

    djvuname = djvuname.replace(" ", "_")
    cache_filename = djvuname + '.dat'

    obj = cache.get(cache_filename)
    if not obj:
        print "CACHE MISS"
        filepage = copy_File.get_filepage(mysite, djvuname)
        if not filepage:
            # can occur if File: has been deleted
            return None
        try:
            url = filepage.fileUrl()
            obj = extract_djvu_text(url, djvuname, filepage.getFileSHA1Sum())
        except:
            utils.print_traceback("extract_djvu_text() fail")
            obj = None
        if obj:
            cache.set(cache_filename, obj)
        else:
            return None
    else:
        if check_timestamp:
            filepage = copy_File.get_filepage(mysite, djvuname)
            if not filepage:
                # can occur if File: has been deleted
                return None
            sha1 = filepage.getFileSHA1Sum()
            if sha1 != obj[0]:
                print "OUTDATED FILE"
                url = filepage.fileUrl()
                try:
                    obj = extract_djvu_text(url, djvuname, sha1)
                    cache.set(cache_filename, obj)
                except:
                    return None

    return obj[1]
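
get_djvu (both versions above) caches a (sha1, text_layer) tuple per file and, when check_timestamp is set, revalidates the entry against the current upload's SHA-1. The same stale-check idea in isolation, as a sketch with a generic fetch callback (all names here are hypothetical):

def cached_fetch(cache, key, current_sha1, fetch):
    # cache holds (sha1, payload) pairs; refetch when the hash has moved on.
    obj = cache.get(key)
    if obj and obj[0] == current_sha1:
        return obj[1]
    payload = fetch()
    cache.set(key, (current_sha1, payload))
    return payload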
Example #14
    def exec_request(self, r):
        sge_job_nr = 0

        # This is a bit convoluted, but we need it to avoid a race condition:
        # we mark the job as running before starting it, so that if this
        # script runs twice in parallel we don't try to start the same job
        # twice. Then, once the job has really started or has failed to
        # start, we update its state again. As we don't know the sge job
        # number yet, we set it to zero. Note this could be done in
        # pending_request() but I prefer to protect it locally.
        really_pending = False
        with db.connection(self):
            q = 'UPDATE job SET job_state=%s, sge_jobnumber=%s WHERE job_id=%s AND job_state="pending"'
            if self.cursor.execute(q, [ 'running', 0, r['job_id'] ]):
                really_pending = True

        if not really_pending:
            print >> sys.stderr, "run request for job_id %s cancelled, as it's no longer pending" % r['job_id']
            return

        cmdline_arg = job_cmdline_arg(r, 'job_run_cmd')
        sge_cmdline = sge_cmdline_arg(r)
        ls = subprocess.Popen(sge_cmdline + cmdline_arg,
                              stdin=None, stdout=subprocess.PIPE,
                              close_fds = True)
        text = ls.stdout.read()
        ls.wait()
        try:
            sge_job_nr = int(re.search('Your job (\d+) ', text).group(1))
            new_state = 'running'
        except:
            utils.print_traceback("sge failure to exec job: %d" % r['job_id'], text)
            new_state = 'sge_fail'


        # Now we can really update the job state, see comment above.
        with db.connection(self):
            q = 'UPDATE job SET job_state=%s, sge_jobnumber=%s WHERE job_id=%s'
            self.cursor.execute(q, [ new_state, sge_job_nr, r['job_id'] ])
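
The conditional UPDATE above acts as a compare-and-set: of two schedulers racing on the same row, only the one whose UPDATE actually matches the still-"pending" row gets a nonzero affected-row count and may launch the job. The claim step in isolation, as a sketch (table and column names are those of the query above; the cursor is assumed to return the affected-row count, which the snippet itself already relies on):

def try_claim(cursor, job_id):
    # Exactly one racer wins: the loser's UPDATE matches zero rows
    # because job_state is no longer 'pending'.
    q = ('UPDATE job SET job_state=%s, sge_jobnumber=%s '
         'WHERE job_id=%s AND job_state="pending"')
    return cursor.execute(q, ['running', 0, job_id]) == 1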
Example #15
def handle_scan_query(params, start_response):
    text = common_html.get_head('pages without scan', css = 'shared.css').encode('utf-8') + '\n  <body>\n'

    if params['lang']:
        try:
            offset = int(params.get('offset', 0))
            limit = min(500, int(params.get('limit', 500)))
            lang = params['lang']
            conn = db.create_conn(domain = lang, family = 'wikisource')
            cursor = db.use_db(conn, domain = lang, family = 'wikisource')
            ns = ws_category.domain_urls[lang][0]
            result = pages_without_scan(ns, cursor)
            result_len = len(result)
            result = result[offset:offset+limit]
            result = [( unicode(x[0], 'utf-8'), x[1]) for x in result]
            text += 'Total: ' + str(result_len) + '<br />'
            next_link = prev_next_link(False, result_len, lang, limit, offset)
            prev_link = prev_next_link(True, result_len, lang, limit, offset)
            text += prev_link + '&#160;' + next_link + '<br /><br />'

            for x in result:
                text += u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0]) +  x[0].replace('_', ' ') + u'</a>, ' + str(x[1]) + u'<br />'

            text += u'<br />' + prev_link + '&#160;' + next_link
            cursor.close()
            conn.close()
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'

    text += '  </body>\n</html>'

    return return_response(start_response, text.encode('utf-8'), False, ret_code, 'text/html')
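
handle_scan_query pages through its result set with offset/limit query parameters, hard-capping the page size at 500. The windowing logic in isolation (a sketch, names hypothetical):

def window(rows, params):
    # Clamp the requested page size, then slice out one page.
    offset = int(params.get('offset', 0))
    limit = min(500, int(params.get('limit', 500)))
    return len(rows), rows[offset:offset + limit]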
Example #16
        print >> sys.stderr, 'missing option -lang: and/or -book:', sys.argv
        exit(1)

    ret = is_uptodate(options.lang, options.book)
    if ret > 0:
        if not hocr(options):
            print >> sys.stderr, 'Error, hocr fail'
            ret = 2
        else:
            update_db(options.lang, options.book)
            ret = 0
    elif ret < 0:
        print >> sys.stderr, "Error, file doesn't exist:", ret
        ret = 3 + abs(ret)
    else:
        update_db(options.lang, options.book)

    return ret

if __name__ == '__main__':
    cache_dir = 'hocr'
    if not os.path.exists(os.path.expanduser('~/cache/' + cache_dir)):
        os.mkdir(os.path.expanduser('~/cache/' + cache_dir))
    try:
        ret = main()
    except:
        utils.print_traceback()
        exit(4)

    exit(ret)
Example #17
def do_match(mysite, maintitle, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        utils.print_traceback("failed to get page")
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes")!=-1:
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')

        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        bl= p.split(new_text)
        for i in range(len(bl)/2):
            title  = bl[i*2+1]
            content = bl[i*2+2]
            filename, pagenum = title.split('/')
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i*2+1].encode("utf8")
                return ret_val(E_ERROR, "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page "+title0)
            r = align.match_page(content, cached_text[int(pagenum)-1])
            print "%s %s  : %f"%(filename, pagenum, r)
            if r < 0.1:
                return ret_val(E_ERROR, "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        #the page is ok
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)

        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)

        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)
        # <&nbsp;><space>
        #new_text = re.sub(u'  ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        new_text = re.sub(u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)', u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = new_text[0:pos.end(0)] + u'\n{{c|' + match_title.group(1) + u'|fs=140%}}\n\n\n' + new_text[pos.end(0):]

        safe_put(page,new_text,user+": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(), None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
    m = re.search(p,text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos+len(m.group(0)):]
        if m.group(4):
            try:
                step = int(m.group(4))
            except:
                return ret_val(E_ERROR, "match tag invalid")
        else:
            step = 1
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number + " " + str(step))
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text, cached_text, djvuname, number, verbose = False, prefix = prefix, step = step)
    if not data['error']:
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""

    return data
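
The __MATCH__ tag that do_match looks for has the shape ==__MATCH__:[[Prefix:Name.djvu/page|step=n]]==, with the step part optional. A hypothetical illustration of the regexp's capture groups ('Page' stands in for the localized prefix):

import re

p = re.compile(r"==__MATCH__:\[\[Page:(.*?)/(\d+)(\|step=(\d+))?\]\]==")
m = p.search(u"==__MATCH__:[[Page:Foo.djvu/12|step=2]]==")
print m.group(1), m.group(2), m.group(4)  # Foo.djvu 12 2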
Example #18
        log(result)
        ret_code = '400 Bad Request'
        text = json.dumps({'error': 1, 'text': ret_code})

    start_response(ret_code,
                   [('Content-Type', 'application/json' + '; charset=UTF-8'),
                    ('Content-Length', str(len(text))),
                    ('Access-Control-Allow-Origin', '*')])
    return [text]


def myapp(environ, start_response):
    params = query_params(environ)

    if params['cmd'] == 'ping':
        return handle_ping(start_response)
    elif params['cmd'] == 'hocr':
        return handle_query(params, start_response)
    else:
        return handle_status(params, start_response)


if __name__ == "__main__":
    sys.stderr = open(os.path.expanduser('~/log/hocr_cgi.err'), 'a')

    from flup.server.cgi import WSGIServer
    try:
        WSGIServer(myapp).run()
    except BaseException:
        utils.print_traceback()
Example #19
def do_match(mysite, maintitle, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        utils.print_traceback("failed to get page")
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')

        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            filename, pagenum = title.split('/')
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(
                    E_ERROR,
                    "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " +
                    title0)
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s  : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(
                    E_ERROR,
                    "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        #the page is ok
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)

        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)

        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)
        # <&nbsp;><space>
        #new_text = re.sub(u'  ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        new_text = re.sub(
            u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
            u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = new_text[
                    0:pos.end(0)] + u'\n{{c|' + match_title.group(
                        1) + u'|fs=140%}}\n\n\n' + new_text[pos.end(0):]

        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(), None,
                                None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix +
                   ":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
        if m.group(4):
            try:
                step = int(m.group(4))
            except:
                return ret_val(E_ERROR, "match tag invalid")
        else:
            step = 1
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number + " " + str(step))
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(
            E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text,
                          cached_text,
                          djvuname,
                          number,
                          verbose=False,
                          prefix=prefix,
                          step=step)
    if not data['error']:
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""

    return data