Exemple #1
def handle_status(params, start_response):
    """Render the job status page and send it with a 200 status.

    Reads the ``filter``, ``cmd_filter``, ``limit`` and ``offset`` entries
    of ``params``, builds the page from the job table followed by the
    accounting table, calls ``start_response`` with the status line and
    headers, and returns the page wrapped in a one-element list.
    """
    fallback_limit = 50
    hard_limit = 1000

    wanted_state = params.get('filter', '')
    wanted_cmd = params.get('cmd_filter', None)
    page_size = get_int_param(params, 'limit', fallback_limit, hard_limit)
    first_row = get_int_param(params, 'offset', 0, None)

    database = sge_jobs.DbJob()

    head = common_html.get_head('hocr', css='shared.css').encode('utf-8')
    page = head + '\n  <body>\n'

    # The job table also hands back the jobs it listed; the accounting
    # table is built from those same jobs.
    table_html, listed_jobs = job_table(database, wanted_state, page_size,
                                        first_row, fallback_limit,
                                        hard_limit, wanted_cmd)
    page += table_html
    page += accounting_table(database, listed_jobs, wanted_state, page_size,
                             first_row, fallback_limit, hard_limit)
    page += '  </body>\n</html>'

    headers = [
        ('Content-Type', 'text/html; charset=UTF-8'),
        ('Content-Length', str(len(page))),
        ('Access-Control-Allow-Origin', '*'),
    ]
    start_response('200 OK', headers)
    return [page]
def do_status(queue):
    """Return the HTML status page for the extract-text-layer queue.

    Works on the value returned by ``queue.copy_items(True)`` and reports
    its length followed by the rendering produced by ``html_for_queue``.
    """
    pending = queue.copy_items(True)

    fragments = [
        common_html.get_head('Extract text layer'),
        u"<body><div>The robot is running.<br/><hr/>",
        u"<br/>%d jobs in extract queue.<br/>" % len(pending),
        html_for_queue(pending),
        u'</div></body></html>',
    ]
    return u''.join(fragments)
def do_status(queue):
    """Return the HTML status page for the verify-match queue.

    Works on the value returned by ``queue.copy_items(True)`` and reports
    its length followed by the rendering produced by ``html_for_queue``.
    """
    snapshot = queue.copy_items(True)
    job_count = len(snapshot)

    page = common_html.get_head(u'Verify match')
    page = page + u"<body><div>The robot is running.<br/><hr/>"
    page = page + u"<br/>%d jobs in verify match queue.<br/>" % job_count
    page = page + html_for_queue(snapshot)
    return page + u'</div></body></html>'
def handle_status(params, start_response):
    """Answer a status request with a minimal "OK" HTML page.

    ``params`` is not used; the page is handed to ``return_response``
    together with ``start_response`` and a '200 OK' status.
    """
    head = common_html.get_head('modernization', css='shared.css')
    page = head.encode('utf-8') + '\n  <body>\n'
    page = page + '<h1>OK</h1>' + '  </body>\n</html>'

    return return_response(start_response, page, False, '200 OK',
                           'text/html')
Exemple #5
def handle_status(params, start_response):
    """Answer a status request with a minimal "OK" HTML page.

    ``params`` is not used; the page is handed to ``return_response``
    together with ``start_response`` and a '200 OK' status.
    """
    encoded_head = common_html.get_head('pages without scan',
                                        css='shared.css').encode('utf-8')
    opening = encoded_head + '\n  <body>\n'
    body = opening + '<h1>OK</h1>'
    document = body + '  </body>\n</html>'

    return return_response(start_response, document, False, '200 OK',
                           'text/html')
Exemple #6
def do_status(queue):
    """Return the HTML status page for the dummy robot queue.

    Works on the value returned by ``queue.copy_items(True)`` and reports
    its length followed by the rendering produced by ``html_for_queue``.
    """
    items = queue.copy_items(True)

    header = common_html.get_head(u'Dummy robot')
    banner = u"<body><div>The robot is running.<br/><hr/>"
    counter = u"<br/>%d jobs in dummy robot queue.<br/>" % len(items)
    listing = html_for_queue(items)

    return header + banner + counter + listing + u'</div></body></html>'
Exemple #7
def do_status(queue):
    """Return the HTML status page for the OCR service queue.

    Works on the value returned by ``queue.copy_items(True)`` and reports
    its length followed by the rendering produced by ``html_for_queue``.
    """
    waiting = queue.copy_items(True)

    pieces = [
        common_html.get_head('OCR service'),
        '<body><div>The ocr robot is runnning.<br /><hr />',
        "%d jobs in queue.<br/>" % len(waiting),
        html_for_queue(waiting),
        '</div></body></html>',
    ]
    return ''.join(pieces)
Exemple #8
def do_status(queue):
    """Return the HTML status page for the OCR service queue.

    Works on the value returned by ``queue.copy_items(True)`` and reports
    its length followed by the rendering produced by ``html_for_queue``.
    """
    jobs_snapshot = queue.copy_items(True)
    how_many = len(jobs_snapshot)

    page = common_html.get_head('OCR service')
    page = page + '<body><div>The ocr robot is runnning.<br /><hr />'
    page = page + "%d jobs in queue.<br/>" % how_many
    page = page + html_for_queue(jobs_snapshot)
    page = page + '</div></body></html>'
    return page
Exemple #9
def not_transcluded(domain, cursor):
    """Write the HTML report of files having pages not transcluded from main.

    Runs a query through ``cursor`` for the pages of the two categories
    configured for ``domain`` in ``urls`` whose title is not listed in
    ``templatelinks`` for the configured namespace from namespace 0.  The
    rows are grouped by the part of the title before the first '/', passed
    through ``filter_result`` and written as an ordered list to
    ``~/tmp/transclusions/<domain>.html``.

    Args:
        domain: key into ``urls``; also used in the output file name and
            in the page title.
        cursor: database cursor providing ``execute``, ``rowcount`` and
            ``fetchall``.

    Returns:
        The number of entries in the filtered result.
    """
    # set of Page: in cat 3/4 not transcluded from main
    query = """
SELECT page_title, page_id FROM categorylinks LEFT JOIN page ON page_id=cl_from
    WHERE cl_to in (%s, %s) AND page_title NOT IN
        (SELECT tl_title FROM templatelinks
            WHERE tl_namespace=%s AND tl_from_namespace=0);
"""
    ns = urls[domain][0]
    cat3 = urls[domain][1]
    cat4 = urls[domain][2]
    cursor.execute(query, [cat3, cat4, ns])
    print(cursor.rowcount)

    result = {}
    for title, page_id in cursor.fetchall():
        # Group sub-pages under the part of the title before the first '/'.
        title = title.split('/')[0]
        # endswith() is used because a fixed five-character slice can never
        # equal the four-character '.pdf' and '.tif' suffixes.
        if title.endswith(('.djvu', '.pdf', '.tif')):
            # NOTE(review): recording page_id here is a reconstruction;
            # the reviewed text only created the empty list -- confirm.
            result.setdefault(title, []).append(page_id)

    result = filter_result(result)

    out_file = os.path.expanduser('~/tmp/transclusions/%s.html' % domain)
    if os.path.exists(out_file):
        # NOTE(review): the body of this test was missing; removing the
        # stale report before rewriting it is presumed -- confirm.
        os.remove(out_file)

    title = '%s.wikisource.org not transcluded page' % domain
    head = common_html.get_head(title, html5=True).encode('utf-8')

    # The context manager closes the report even if a write fails.
    with open(out_file, 'w') as out_fd:
        out_fd.write(head + '\n')
        out_fd.write('<body>\n')
        if len(result):
            out_fd.write('<ol>\n')
            for d in result:
                out_fd.write(format_html_line(domain, d[1], d[0]) + '\n')
            out_fd.write('</ol>\n')
        else:
            out_fd.write("Empty result, no Index meet the criteria to be "
                         "listed in this file.\n")
        out_fd.write('\n</body>\n</html>\n')

    return len(result)
Exemple #10
def not_transcluded(domain, cursor):
    """Write the HTML report of files having pages not transcluded from main.

    Runs a query through ``cursor`` for the pages of the two categories
    configured for ``domain`` in ``urls`` whose title is not listed in
    ``templatelinks`` for the configured namespace from namespace 0.  The
    rows are grouped by the part of the title before the first '/', passed
    through ``filter_result`` and written as an ordered list to
    ``~/tmp/transclusions/<domain>.html``.

    Args:
        domain: key into ``urls``; also used in the output file name and
            in the page title.
        cursor: database cursor providing ``execute``, ``rowcount`` and
            ``fetchall``.

    Returns:
        The number of entries in the filtered result.
    """
    # set of Page: in cat 3/4 not transcluded from main
    query = """
SELECT page_title, page_id FROM categorylinks LEFT JOIN page ON page_id=cl_from
    WHERE cl_to in (%s, %s) AND page_title NOT IN
        (SELECT tl_title FROM templatelinks
            WHERE tl_namespace=%s AND tl_from_namespace=0);
"""
    ns = urls[domain][0]
    cat3 = urls[domain][1]
    cat4 = urls[domain][2]
    cursor.execute(query, [ cat3, cat4, ns ])
    print(cursor.rowcount)

    result = {}
    for title, page_id in cursor.fetchall():
        # Group sub-pages under the part of the title before the first '/'.
        title = title.split('/')[0]
        # endswith() is used because a fixed five-character slice can never
        # equal the four-character '.pdf' and '.tif' suffixes.
        if title.endswith(('.djvu', '.pdf', '.tif')):
            # NOTE(review): recording page_id here is a reconstruction;
            # the reviewed text only created the empty list -- confirm.
            result.setdefault(title, []).append(page_id)

    result = filter_result(result)

    out_file = os.path.expanduser('~/tmp/transclusions/%s.html' % domain)
    if os.path.exists(out_file):
        # NOTE(review): the body of this test was missing; removing the
        # stale report before rewriting it is presumed -- confirm.
        os.remove(out_file)

    title = '%s.wikisource.org not transcluded page' % domain
    head = common_html.get_head(title, html5 = True).encode('utf-8')

    # The context manager closes the report even if a write fails.
    with open(out_file, 'w') as out_fd:
        out_fd.write(head + '\n')
        out_fd.write('<body>\n')
        if len(result):
            out_fd.write('<ol>\n')
            for d in result:
                out_fd.write(format_html_line(domain, d[1], d[0]) + '\n')
            out_fd.write('</ol>\n')
        else:
            out_fd.write("Empty result, no Index meet the criteria to be "
                         "listed in this file.\n")
        out_fd.write('\n</body>\n</html>\n')

    return len(result)
    def parse_global_dict(self, html):
        """Extract ``key : value`` entries from the list items of ``html``.

        ``html`` is wrapped into a full document, parsed, and the text of
        every XHTML ``li`` element is collected.  Each line of that text
        matching the entry pattern adds one entry to the mapping returned
        by ``self.default_cache()``.
        """
        entries = self.default_cache()

        document = common_html.get_head(u'TITLE') + u"\n<body>"  + html + u'\n</body>\n</html>'
        tree = etree.fromstring(document.encode('utf-8'))

        items = tree.findall(".//{http://www.w3.org/1999/xhtml}li")
        collected = u''.join([self.get_etree_text(item, set())
                              for item in items])

        for row in collected.split(u'\n'):
            found = re.match(u'^\s*(\S[^: ]*?)(?:\s|&#160;|&nbsp;| )*:\s*([\S].+?)\s*(?:\/\/.*?)?$', row, re.UNICODE)
            if not found:
                continue
            entries[found.group(1)] = found.group(2)

        return entries
Exemple #12
def do_status():
    """Return the HTML status page for the match and split queues.

    Reads both queues from the module-level ``jobs`` mapping, reports the
    length and rendering of each, then the match/split counters that
    ``jobs`` also holds.
    """
    match_items = jobs['match_queue'].copy_items(True)
    split_items = jobs['split_queue'].copy_items(True)

    sections = [
        common_html.get_head('Match and split'),
        u"<body><div>the robot is running.<br/><hr/>",
        u"<br/>%d jobs in match queue.<br/>" % len(match_items),
        html_for_queue(match_items),
        u"<br/>%d jobs in split queue.<br/>" % len(split_items),
        html_for_queue(split_items),
        u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs,
        u'</div></body></html>',
    ]
    return u''.join(sections)
Exemple #13
def do_status():
    """Return the HTML status page for the match and split queues.

    Reads both queues from the module-level ``jobs`` mapping, reports the
    length and rendering of each, then the match/split counters that
    ``jobs`` also holds.
    """
    to_match = jobs['match_queue'].copy_items(True)
    to_split = jobs['split_queue'].copy_items(True)

    page = common_html.get_head('Match and split')
    page = page + u"<body><div>the robot is running.<br/><hr/>"

    # One counter line plus queue listing per queue, match first.
    page = page + u"<br/>%d jobs in match queue.<br/>" % len(to_match)
    page = page + html_for_queue(to_match)
    page = page + u"<br/>%d jobs in split queue.<br/>" % len(to_split)
    page = page + html_for_queue(to_split)

    totals = u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs
    return page + totals + u'</div></body></html>'
Exemple #14
    def parse_global_dict(self, html):
        """Extract ``key : value`` entries from the list items of ``html``.

        ``html`` is wrapped into a full document, parsed, and the text of
        every XHTML ``li`` element is collected.  Each line of that text
        matching the entry pattern adds one entry to the mapping returned
        by ``self.default_cache()``.
        """
        parsed = self.default_cache()

        head = common_html.get_head(u'TITLE')
        page = head + u"\n<body>" + html + u'\n</body>\n</html>'
        tree = etree.fromstring(page.encode('utf-8'))

        chunks = []
        for node in tree.findall(".//{http://www.w3.org/1999/xhtml}li"):
            chunks.append(self.get_etree_text(node, set()))
        content = u''.join(chunks)

        for entry in content.split(u'\n'):
            hit = re.match(
                u'^\s*(\S[^: ]*?)(?:\s|&#160;|&nbsp;| )*:\s*([\S].+?)\s*(?:\/\/.*?)?$',
                entry, re.UNICODE)
            if hit:
                key, value = hit.group(1), hit.group(2)
                parsed[key] = value

        return parsed
Exemple #15
def handle_scan_query(params, start_response):
    """Answer a query listing the pages without scan of one wikisource.

    With a non-empty ``lang`` in ``params``, the result of
    ``pages_without_scan`` for that language is sliced by ``offset`` and
    ``limit`` (capped at 500) and rendered as links, with previous/next
    links above and below the list.  Any failure while building the list
    yields a '500 Internal Server Error' page; a missing or empty ``lang``
    yields '400 Bad Request'.

    Args:
        params: mapping of request parameters (``lang``, ``offset``,
            ``limit``).
        start_response: passed through to ``return_response``.

    Returns:
        Whatever ``return_response`` returns for the built page.
    """
    import traceback

    text = common_html.get_head('pages without scan', css = 'shared.css').encode('utf-8') + '\n  <body>\n'

    # .get() so that an absent 'lang' is reported as 400, not a KeyError.
    lang = params.get('lang')
    if lang:
        try:
            offset = int(params.get('offset', 0))
            limit = min(500, int(params.get('limit', 500)))
            conn = db.create_conn(domain = lang, family = 'wikisource')
            cursor = db.use_db(conn, domain = lang, family = 'wikisource')
            ns = ws_category.domain_urls[lang][0]
            result = pages_without_scan(ns, cursor)
            result_len = len(result)
            result = result[offset:offset+limit]
            result = [( unicode(x[0], 'utf-8'), x[1]) for x in result]
            text += 'Total: ' + str(result_len) + '<br />'
            next_link = prev_next_link(False, result_len, lang, limit, offset)
            prev_link = prev_next_link(True, result_len, lang, limit, offset)
            text += prev_link + '&#160;' + next_link + '<br /><br />'

            for x in result:
                text += u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0]) +  x[0].replace('_', ' ') + u'</a>, ' + str(x[1]) + u'<br />'

            text += u'<br />' + prev_link + '&#160;' + next_link
            ret_code = '200 OK'
        except Exception:
            # Keep a trace of the failure; the client only gets the status.
            traceback.print_exc()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'

    text += '  </body>\n</html>'

    return return_response(start_response, text.encode('utf-8'), False, ret_code, 'text/html')
    def load_text(self, p, variant):
        """Return the transformed text of page ``p``, cached per revision.

        The cache file is named after ``p.latestRevision()`` inside
        ``self.cache_dir + self.lang``.  When it exists its content is
        returned.  Otherwise the page HTML is fetched and parsed, the text
        is extracted with ``get_etree_text`` while excluding the ``div``
        elements whose id is the variant's ``modernize_div_id``, the
        variant's ``transform`` substitutions are applied, and the result
        is written to the cache file before being returned.

        Args:
            p: page object providing ``latestRevision()``.
            variant: key into ``self.config``.

        Returns:
            The text, freshly computed or read back from the cache.
        """
        filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        if os.path.exists(filename):
            return utils.read_file(filename)

        html = self.get_html(p)
        new_html = common_html.get_head(u'TITLE') + u"\n<body>"  + html + u'\n</body>\n</html>'

        root = etree.fromstring(new_html.encode('utf-8'))
        exclude = set()
        html_id = self.config[variant]['modernize_div_id']

        for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            # NOTE(review): loop body reconstructed; `exclude` is otherwise
            # never filled before being given to get_etree_text -- confirm.
            exclude.add(it)

        text = self.get_etree_text(root, exclude)
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)

        utils.write_file(filename, text)

        return text
Exemple #17
    def suggest_dict(self, title):
        """Compute, for every variant, dictionary suggestions for a page.

        The page HTML is parsed once and its text extracted, excluding the
        ``div`` elements whose id is the ``modernize_div_id`` of any
        variant.  For each variant the text is transformed, split into
        words, and every word sequence is looked up with ``find_repl``:
        first in the page's local dict and the global dict, then in the
        local dicts of the other cached pages.

        Args:
            title: title handed to ``self.get_page``.

        Returns:
            A dict keyed by variant name, each value being a dict with:
            ``local_dict_used`` (list of ``(word, repl)`` pairs, in local
            dict order), ``suggest_local_dict`` (the ``items()`` of the
            suggestions found in other local dicts) and
            ``speller_suggest`` (list of ``(word, suggestions)`` pairs,
            at most five suggestions each, for words failing the check).
        """
        p = self.get_page(title)
        html = self.get_html(p)

        new_html = common_html.get_head(
            u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(new_html.encode('utf-8'))

        exclude = set()

        for variant in self.variants:
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(
                    ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" %
                    html_id):
                # NOTE(review): loop body reconstructed; `exclude` is
                # otherwise never filled -- confirm.
                exclude.add(it)

        html_text = self.get_etree_text(root, exclude)

        # result = {
        # 'variant_name_1' : {
        #    'local_dict_used' : [(A, B), ... ],
        #    'suggest_local_dict' : { 'C' : 'D' ... },
        #    'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
        #    }
        # 'variant_name_2' : { ... }
        # }
        result = {}

        blacklist = self.load_blacklist()

        # Does not depend on the variant, so compile it once.
        regex_split = re.compile(u'([' + self.word_chars + u']+)')

        for variant in self.variants:
            speller = spell.Speller(self.config[variant]['aspell_lang'])
            cache = self.load_dicts(variant)
            if 'global_dict' in cache:
                global_dict = cache['global_dict'][1]
            else:
                global_dict = self.default_cache()

            other_local_dict = {}
            for key in cache:
                if key != 'global_dict':
                    d = cache[key][1]
                    for words in d:
                        other_local_dict[words] = d[words]

            local_dict = self.parse_local_dict(variant, html)

            text = html_text

            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            # set of entry used in the local dict, a set because we want
            # to keep the order in local_dict so we don't store here the repl
            # string but we will iter the ordered local_dict and check
            # if a word is present in this set.
            used_local_dict = set()
            # map of entry used in all other local dict, good suggestion to
            # give to user
            suggest_local_dict = {}
            # all other words, these will be check spelled to provide an
            # additional set of suggestion
            word_seen = set()

            words_list = regex_split.findall(text)
            i = 0
            while i < len(words_list):
                if words_list[i] in blacklist:
                    i += 1
                    continue

                repl, glb, new_words, num = self.find_repl(
                    words_list, i, local_dict, global_dict)

                if repl:
                    if not glb:
                        # NOTE(review): reconstructed from the comment on
                        # used_local_dict above -- confirm.
                        used_local_dict.add(new_words)
                else:
                    # not found in global or local dict, try in all other
                    # local dict to get suggestion.
                    repl, glb, new_words, num = self.find_repl(
                        words_list, i, other_local_dict, {})
                    if repl:
                        # don't do any suggest for one letter
                        if num > 1 or len(words_list[i]) > 1:
                            suggest_local_dict[new_words] = repl

                if not repl:
                    # NOTE(review): reconstructed from the comment on
                    # word_seen above -- confirm.
                    word_seen.add(words_list[i])
                    i += 1
                else:
                    # Skip every word consumed by the replacement.
                    i += num

            word_seen = [x for x in word_seen if not speller.check(x)]
            speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

            # local dict is an ordered dict, so we can put words in the same
            # order as the local_dict, this allow better wiki diff when a local
            # dict is updated.
            local_dict_used = [(x, local_dict[x]) for x in local_dict
                               if x in used_local_dict]

            # FIXME: for suggest_local_dict, must we remove suggested words
            # from other local dict but working word for the check speller?

            result[variant] = {}
            result[variant]['local_dict_used'] = local_dict_used
            result[variant]['suggest_local_dict'] = suggest_local_dict.items()
            result[variant]['speller_suggest'] = speller_suggest

        return result
    def suggest_dict(self, title):
        """Compute, for every variant, dictionary suggestions for a page.

        The page HTML is parsed once and its text extracted, excluding the
        ``div`` elements whose id is the ``modernize_div_id`` of any
        variant.  For each variant the text is transformed, split into
        words, and every word sequence is looked up with ``find_repl``:
        first in the page's local dict and the global dict, then in the
        local dicts of the other cached pages.

        Args:
            title: title handed to ``self.get_page``.

        Returns:
            A dict keyed by variant name, each value being a dict with:
            ``local_dict_used`` (list of ``(word, repl)`` pairs, in local
            dict order), ``suggest_local_dict`` (the ``items()`` of the
            suggestions found in other local dicts) and
            ``speller_suggest`` (list of ``(word, suggestions)`` pairs,
            at most five suggestions each, for words failing the check).
        """
        p = self.get_page(title)
        html = self.get_html(p)

        new_html = common_html.get_head(u'TITLE') + u"\n<body>"  + html + u'\n</body>\n</html>'
        root = etree.fromstring(new_html.encode('utf-8'))

        exclude = set()

        for variant in self.variants:
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
                # NOTE(review): loop body reconstructed; `exclude` is
                # otherwise never filled -- confirm.
                exclude.add(it)

        html_text = self.get_etree_text(root, exclude)

        # result = {
        # 'variant_name_1' : {
        #    'local_dict_used' : [(A, B), ... ],
        #    'suggest_local_dict' : { 'C' : 'D' ... },
        #    'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
        #    }
        # 'variant_name_2' : { ... }
        # }
        result = {}

        blacklist = self.load_blacklist()

        # Does not depend on the variant, so compile it once.
        regex_split = re.compile(u'([' + self.word_chars + u']+)')

        for variant in self.variants:
            speller = spell.Speller(self.config[variant]['aspell_lang'])
            cache = self.load_dicts(variant)
            if 'global_dict' in cache:
                global_dict = cache['global_dict'][1]
            else:
                global_dict = self.default_cache()

            other_local_dict = {}
            for key in cache:
                if key != 'global_dict':
                    d = cache[key][1]
                    for words in d:
                        other_local_dict[words] = d[words]

            local_dict = self.parse_local_dict(variant, html)

            text = html_text

            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            # set of entry used in the local dict, a set because we want
            # to keep the order in local_dict so we don't store here the repl
            # string but we will iter the ordered local_dict and check
            # if a word is present in this set.
            used_local_dict = set()
            # map of entry used in all other local dict, good suggestion to
            # give to user
            suggest_local_dict = {}
            # all other words, these will be check spelled to provide an
            # additional set of suggestion
            word_seen = set()

            words_list = regex_split.findall(text)
            i = 0
            while i < len(words_list):
                if words_list[i] in blacklist:
                    i += 1
                    continue

                repl, glb, new_words, num = self.find_repl(words_list, i,
                                                           local_dict,
                                                           global_dict)

                if repl:
                    if not glb:
                        # NOTE(review): reconstructed from the comment on
                        # used_local_dict above -- confirm.
                        used_local_dict.add(new_words)
                else:
                    # not found in global or local dict, try in all other
                    # local dict to get suggestion.
                    repl, glb, new_words, num = self.find_repl(words_list, i,
                                                               other_local_dict,
                                                               {})
                    if repl:
                        # don't do any suggest for one letter
                        if num > 1 or len(words_list[i]) > 1:
                            suggest_local_dict[new_words] = repl

                if not repl:
                    # NOTE(review): reconstructed from the comment on
                    # word_seen above -- confirm.
                    word_seen.add(words_list[i])
                    i += 1
                else:
                    # Skip every word consumed by the replacement.
                    i += num

            word_seen = [x for x in word_seen if not speller.check(x)]
            speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

            # local dict is an ordered dict, so we can put words in the same
            # order as the local_dict, this allow better wiki diff when a local
            # dict is updated.
            local_dict_used = [ (x, local_dict[x]) for x in local_dict if x in used_local_dict ]

            # FIXME: for suggest_local_dict, must we remove suggested words
            # from other local dict but working word for the check speller?

            result[variant] = {}
            result[variant]['local_dict_used'] = local_dict_used
            result[variant]['suggest_local_dict'] = suggest_local_dict.items()
            result[variant]['speller_suggest'] = speller_suggest

        return result