def handle_status(params, start_response):
    """Serve the job table and the accounting table as one HTML page.

    ``params`` provides the optional ``filter``, ``cmd_filter``, ``limit``
    and ``offset`` request values.  ``start_response`` is invoked with a
    ``200 OK`` status and the response headers.  The return value is a
    one-element list holding the page.
    """
    default_limit = 50
    max_limit = 1000

    state_filter = params.get('filter', '')
    cmd_filter = params.get('cmd_filter', None)
    limit = get_int_param(params, 'limit', default_limit, max_limit)
    offset = get_int_param(params, 'offset', 0, None)

    db_obj = sge_jobs.DbJob()

    page_head = common_html.get_head('hocr', css='shared.css')
    page = page_head.encode('utf-8') + '\n <body>\n'

    # job_table also hands back the jobs it listed; accounting_table
    # receives them as its second argument.
    jobs_html, jobs = job_table(db_obj, state_filter, limit, offset,
                                default_limit, max_limit, cmd_filter)
    accounting_html = accounting_table(db_obj, jobs, state_filter, limit,
                                       offset, default_limit, max_limit)
    page = page + jobs_html + accounting_html + ' </body>\n</html>'

    headers = [
        ('Content-Type', 'text/html; charset=UTF-8'),
        ('Content-Length', str(len(page))),
        ('Access-Control-Allow-Origin', '*'),
    ]
    start_response('200 OK', headers)
    return [page]
def do_status(queue):
    """Return the HTML status page of the extract-text-layer robot.

    The page reports the number of items returned by
    ``queue.copy_items(True)`` followed by the markup that
    ``html_for_queue`` produces for them.
    """
    pending = queue.copy_items(True)
    fragments = [
        common_html.get_head('Extract text layer'),
        u"<body><div>The robot is running.<br/><hr/>",
        u"<br/>%d jobs in extract queue.<br/>" % len(pending),
        html_for_queue(pending),
        u'</div></body></html>',
    ]
    return u''.join(fragments)
def do_status(queue):
    """Return the HTML status page of the verify-match robot.

    The job count and the listing are both taken from the result of
    ``queue.copy_items(True)``, not from ``queue`` itself.
    """
    snapshot = queue.copy_items(True)
    return (
        common_html.get_head(u'Verify match')
        + u"<body><div>The robot is running.<br/><hr/>"
        + u"<br/>%d jobs in verify match queue.<br/>" % len(snapshot)
        + html_for_queue(snapshot)
        + u'</div></body></html>'
    )
def handle_status(params, start_response):
    """Answer a status request with a fixed "OK" HTML page.

    ``params`` is accepted but not read.  The page and ``start_response``
    are passed on to ``return_response`` whose result is returned.
    """
    page_head = common_html.get_head('modernization', css='shared.css')
    page = (page_head.encode('utf-8') + '\n <body>\n'
            + '<h1>OK</h1>'
            + ' </body>\n</html>')
    return return_response(start_response, page, False, '200 OK', 'text/html')
def handle_status(params, start_response):
    """Answer a status request with a fixed "OK" HTML page.

    ``params`` is accepted but not read.  The page and ``start_response``
    are passed on to ``return_response`` whose result is returned.
    """
    page_head = common_html.get_head('pages without scan', css='shared.css')
    page = (page_head.encode('utf-8') + '\n <body>\n'
            + '<h1>OK</h1>'
            + ' </body>\n</html>')
    return return_response(start_response, page, False, '200 OK', 'text/html')
def do_status(queue):
    """Return the HTML status page of the dummy robot.

    The page reports the number of items returned by
    ``queue.copy_items(True)`` followed by the markup that
    ``html_for_queue`` produces for them.
    """
    items = queue.copy_items(True)
    job_count_line = u"<br/>%d jobs in dummy robot queue.<br/>" % len(items)
    return u''.join([
        common_html.get_head(u'Dummy robot'),
        u"<body><div>The robot is running.<br/><hr/>",
        job_count_line,
        html_for_queue(items),
        u'</div></body></html>',
    ])
def do_status(queue):
    """Return the HTML status page of the OCR robot.

    The job count and the listing are both taken from the result of
    ``queue.copy_items(True)``, not from ``queue`` itself.
    """
    snapshot = queue.copy_items(True)
    return (
        common_html.get_head('OCR service')
        + '<body><div>The ocr robot is runnning.<br /><hr />'
        + "%d jobs in queue.<br/>" % len(snapshot)
        + html_for_queue(snapshot)
        + '</div></body></html>'
    )
def not_transcluded(domain, cursor):
    """Write the HTML report of not-transcluded pages for *domain*.

    Runs a query for pages that are in either of the two categories
    configured in ``urls[domain]`` and whose title does not appear in
    ``templatelinks`` for the configured namespace with a namespace-0
    source.  Page ids are grouped by the part of the title before the
    first '/', restricted to titles ending in .djvu, .pdf or .tif, passed
    through ``filter_result`` and written as an ordered list to
    ``~/tmp/transclusions/<domain>.html``.

    Returns the number of entries in the filtered result.
    """
    # set of Page: in cat 3/4 not transcluded from main
    query = """
SELECT page_title, page_id
FROM categorylinks LEFT JOIN page ON page_id=cl_from
WHERE cl_to in (%s, %s) AND
      page_title NOT IN
          (SELECT tl_title
           FROM templatelinks
           WHERE tl_namespace=%s AND tl_from_namespace=0);
"""
    ns = urls[domain][0]
    cat3 = urls[domain][1]
    cat4 = urls[domain][2]
    cursor.execute(query, [cat3, cat4, ns])
    print(cursor.rowcount)

    result = {}
    for title, page_id in cursor.fetchall():
        title = title.split('/')[0]
        # The previous test compared the last five characters against the
        # list, so the four-character extensions could never match.
        if title.endswith(('.djvu', '.pdf', '.tif')):
            result.setdefault(title, []).append(page_id)

    result = filter_result(result)

    out_file = os.path.expanduser('~/tmp/transclusions/%s.html' % domain)
    if os.path.exists(out_file):
        os.remove(out_file)

    title = '%s.wikisource.org not transcluded page' % domain
    head = common_html.get_head(title, html5=True).encode('utf-8')

    # "with" guarantees the file is closed even if formatting a line fails.
    with open(out_file, 'w') as out_fd:
        out_fd.write(head + '\n')
        out_fd.write('<body>\n')
        if len(result):
            out_fd.write('<ol>\n')
            for d in result:
                line = format_html_line(domain, d[1], d[0])
                out_fd.write(line + '\n')
            out_fd.write('</ol>\n')
        else:
            # This message used to be a bare string expression and was
            # therefore never written to the report.
            out_fd.write("Empty result, no Index meet the criteria to be "
                         "listed in this file.\n")
        out_fd.write('\n</body>\n</html>\n')

    return len(result)
def not_transcluded(domain, cursor):
    """Generate ``~/tmp/transclusions/<domain>.html`` and return its entry count.

    The query selects pages from the two categories given by
    ``urls[domain][1]`` and ``urls[domain][2]`` whose title is absent from
    ``templatelinks`` rows of namespace ``urls[domain][0]`` coming from
    namespace 0.  Titles are cut at the first '/', only .djvu/.pdf/.tif
    titles are kept, and the page ids are collected per title.  The mapping
    goes through ``filter_result``; each resulting entry becomes one line
    produced by ``format_html_line`` inside an ``<ol>``.

    Returns ``len()`` of the filtered result.
    """
    # set of Page: in cat 3/4 not transcluded from main
    query = """
SELECT page_title, page_id
FROM categorylinks LEFT JOIN page ON page_id=cl_from
WHERE cl_to in (%s, %s) AND
      page_title NOT IN
          (SELECT tl_title
           FROM templatelinks
           WHERE tl_namespace=%s AND tl_from_namespace=0);
"""
    ns = urls[domain][0]
    cat3 = urls[domain][1]
    cat4 = urls[domain][2]
    cursor.execute(query, [ cat3, cat4, ns ])
    print(cursor.rowcount)

    # endswith() with a tuple handles extensions of different lengths; the
    # former "title[-5:] in [...]" test silently rejected .pdf and .tif.
    wanted_suffixes = ('.djvu', '.pdf', '.tif')
    result = {}
    for title, page_id in cursor.fetchall():
        title = title.split('/')[0]
        if not title.endswith(wanted_suffixes):
            continue
        result.setdefault(title, []).append(page_id)

    result = filter_result(result)

    out_file = os.path.expanduser('~/tmp/transclusions/%s.html' % domain)
    if os.path.exists(out_file):
        os.remove(out_file)

    title = '%s.wikisource.org not transcluded page' % domain
    head = common_html.get_head(title, html5 = True).encode('utf-8')

    out_fd = open(out_file, 'w')
    try:
        out_fd.write(head + '\n')
        out_fd.write('<body>\n')
        if len(result):
            out_fd.write('<ol>\n')
            for d in result:
                out_fd.write(format_html_line(domain, d[1], d[0]) + '\n')
            out_fd.write('</ol>\n')
        else:
            # Previously a no-op string expression: the explanation never
            # reached the output file.
            out_fd.write("Empty result, no Index meet the criteria to be "
                         "listed in this file.\n")
        out_fd.write('\n</body>\n</html>\n')
    finally:
        # Close the report even when writing a line raises.
        out_fd.close()

    return len(result)
def parse_global_dict(self, html):
    """Extract "key : value" pairs from the <li> elements of *html*.

    *html* is wrapped into a complete document and parsed with ``etree``.
    The text of every XHTML ``li`` element is concatenated, split into
    lines, and each line matching the entry pattern adds one item to the
    mapping obtained from ``self.default_cache()``, which is returned.
    """
    entries = self.default_cache()

    document = u''.join([
        common_html.get_head(u'TITLE'),
        u"\n<body>",
        html,
        u'\n</body>\n</html>',
    ])
    tree = etree.fromstring(document.encode('utf-8'))

    items = tree.findall(".//{http://www.w3.org/1999/xhtml}li")
    text = u''.join(self.get_etree_text(item, set()) for item in items)

    for line in text.split(u'\n'):
        found = re.match(u'^\s*(\S[^: ]*?)(?:\s| | | )*:\s*([\S].+?)\s*(?:\/\/.*?)?$', line, re.UNICODE)
        if not found:
            continue
        key, value = found.group(1, 2)
        entries[key] = value

    return entries
def do_status():
    """Return the HTML status page of the match and split robot.

    Both queues are read from the module-level ``jobs`` mapping through
    ``copy_items(True)``; the counters shown on the last line are
    formatted straight from ``jobs`` as well.
    """
    match_items = jobs['match_queue'].copy_items(True)
    split_items = jobs['split_queue'].copy_items(True)

    fragments = [
        common_html.get_head('Match and split'),
        u"<body><div>the robot is running.<br/><hr/>",
        u"<br/>%d jobs in match queue.<br/>" % len(match_items),
        html_for_queue(match_items),
        u"<br/>%d jobs in split queue.<br/>" % len(split_items),
        html_for_queue(split_items),
        u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs,
        u'</div></body></html>',
    ]
    return u''.join(fragments)
def parse_global_dict(self, html):
    """Fill a default cache with the "key : value" lines found in *html*.

    The fragment is embedded in a full document, parsed with ``etree``,
    and the text of each XHTML ``li`` element is gathered.  Every line of
    that text matching the entry pattern stores group 2 under group 1 in
    the mapping returned by ``self.default_cache()``.
    """
    cache = self.default_cache()

    page = common_html.get_head(u'TITLE') + u"\n<body>" + html
    page += u'\n</body>\n</html>'
    tree = etree.fromstring(page.encode('utf-8'))

    collected = u''
    for li in tree.findall(".//{http://www.w3.org/1999/xhtml}li"):
        collected += self.get_etree_text(li, set())

    entry_re = re.compile(u'^\s*(\S[^: ]*?)(?:\s| | | )*:\s*([\S].+?)\s*(?:\/\/.*?)?$', re.UNICODE)
    for line in collected.split(u'\n'):
        hit = entry_re.match(line)
        if hit:
            cache[hit.group(1)] = hit.group(2)

    return cache
def handle_scan_query(params, start_response):
    """Serve one page of the "pages without scan" listing for a language.

    Request values read from ``params``:
      lang   -- wikisource language code; missing or empty yields a
                ``400 Bad Request`` page.
      offset -- index of the first row to show (default 0, negative
                values are treated as 0).
      limit  -- number of rows to show (default 500, capped at 500,
                negative values are treated as 0).

    Any exception raised while building the listing is logged through
    ``utils.print_traceback()`` and answered with a
    ``500 Internal Server Error`` page.  The result of ``return_response``
    is returned.
    """
    text = common_html.get_head('pages without scan', css = 'shared.css').encode('utf-8') + '\n <body>\n'

    # .get() so that a request without "lang" reaches the 400 branch
    # instead of raising KeyError.
    lang = params.get('lang')
    if lang:
        try:
            offset = max(0, int(params.get('offset', 0)))
            limit = max(0, min(500, int(params.get('limit', 500))))

            conn = None
            cursor = None
            try:
                conn = db.create_conn(domain = lang, family = 'wikisource')
                cursor = db.use_db(conn, domain = lang, family = 'wikisource')
                ns = ws_category.domain_urls[lang][0]
                result = pages_without_scan(ns, cursor)
            finally:
                # Release the database resources even when the query or
                # the domain lookup fails.
                if cursor is not None:
                    cursor.close()
                if conn is not None:
                    conn.close()

            result_len = len(result)
            result = result[offset:offset+limit]
            result = [( unicode(x[0], 'utf-8'), x[1]) for x in result]

            text += 'Total: ' + str(result_len) + '<br />'
            next_link = prev_next_link(False, result_len, lang, limit, offset)
            prev_link = prev_next_link(True, result_len, lang, limit, offset)
            text += prev_link + ' ' + next_link + '<br /><br />'

            for x in result:
                text += u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0]) + x[0].replace('_', ' ') + u'</a>, ' + str(x[1]) + u'<br />'

            text += u'<br />' + prev_link + ' ' + next_link
            ret_code = '200 OK'
        except Exception:
            # Exception rather than a bare except so that SystemExit and
            # KeyboardInterrupt are not swallowed.
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'

    text += ' </body>\n</html>'

    return return_response(start_response, text.encode('utf-8'), False, ret_code, 'text/html')
def load_text(self, p, variant):
    """Return the extracted text of page *p* for *variant*, using a file cache.

    The cache file is named after ``p.latestRevision()`` inside
    ``self.cache_dir + self.lang``.  When it exists its content is
    returned as is.  Otherwise the page HTML is wrapped into a full
    document, parsed, stripped of the divs whose id equals the variant's
    ``modernize_div_id``, run through the variant's ``transform``
    substitutions, written to the cache file and returned.
    """
    cache_path = self.cache_dir + self.lang + '/' + str(p.latestRevision())
    if os.path.exists(cache_path):
        return utils.read_file(cache_path)

    page_html = self.get_html(p)
    document = u''.join([
        common_html.get_head(u'TITLE'),
        u"\n<body>",
        page_html,
        u'\n</body>\n</html>',
    ])
    tree = etree.fromstring(document.encode('utf-8'))

    div_id = self.config[variant]['modernize_div_id']
    div_path = ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % div_id
    skipped = set(tree.findall(div_path))

    text = self.get_etree_text(tree, skipped)
    for rule in self.config[variant]['transform']:
        text = re.sub(rule[0], rule[1], text)

    utils.write_file(cache_path, text)
    return text
def suggest_dict(self, title):
    """Compute local-dictionary suggestions for the page *title*.

    The page HTML is wrapped into a full document and parsed; its text is
    extracted while excluding every div whose id is the
    'modernize_div_id' of one of ``self.variants``.  For each variant the
    text goes through the variant's 'transform' substitutions, is split
    into words, and the words are looked up with ``self.find_repl``.

    Returns a dict keyed by variant name, see the layout comment below.
    """
    p = self.get_page(title)
    html = self.get_html(p)

    new_html = common_html.get_head(
        u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'

    root = etree.fromstring(new_html.encode('utf-8'))

    # Collect the divs of all variants so that none of them contributes
    # to the extracted text.
    exclude = set()

    for variant in self.variants:
        html_id = self.config[variant]['modernize_div_id']

        for it in root.findall(
                ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)

    html_text = self.get_etree_text(root, exclude)

    # result = {
    #     'variant_name_1' : {
    #         'local_dict_used' : [(A, B), ... ],
    #         'suggest_local_dict' : { 'C' : 'D' ... },
    #         'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
    #     }
    #     'variant_name_2' : { ... }
    # }
    # Note: 'suggest_local_dict' is stored as the .items() of the dict.
    result = {}

    blacklist = self.load_blacklist()

    for variant in self.variants:
        speller = spell.Speller(self.config[variant]['aspell_lang'])
        cache = self.load_dicts(variant)
        if 'global_dict' in cache:
            global_dict = cache['global_dict'][1]
        else:
            global_dict = self.default_cache()

        # Merge every cached dict except the global one into a single
        # mapping; on duplicate keys the entry visited last wins.
        other_local_dict = {}
        for key in cache:
            if key != 'global_dict':
                d = cache[key][1]
                for words in d:
                    other_local_dict[words] = d[words]

        # The local dict is parsed from the raw page HTML, not from the
        # extracted text.
        local_dict = self.parse_local_dict(variant, html)

        text = html_text
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)

        # Keys of local_dict that were actually used. Only membership is
        # recorded here: the replacement strings are read back from the
        # ordered local_dict later, which preserves its order.
        used_local_dict = set()
        # Entries found in the other local dicts; good suggestions to
        # give to the user.
        suggest_local_dict = {}
        # All remaining words; they are spell checked to provide an
        # additional set of suggestions.
        word_seen = set()

        regex_split = re.compile(u'([' + self.word_chars + u']+)')
        words_list = regex_split.findall(text)
        i = 0
        while True:
            if i >= len(words_list):
                break

            # Blacklisted words are skipped without any lookup.
            if words_list[i] in blacklist:
                i += 1
                continue

            # find_repl yields (repl, glb, new_words, num); num is the
            # step used below to advance i.
            # NOTE(review): glb presumably tells that the match came from
            # the global dict -- confirm against find_repl.
            repl, glb, new_words, num = self.find_repl(
                words_list, i, local_dict, global_dict)

            if repl:
                if not glb:
                    used_local_dict.add(new_words)
            else:
                # Not found in the global or local dict: try all the
                # other local dicts to get a suggestion.
                repl, glb, new_words, num = self.find_repl(
                    words_list, i, other_local_dict, {})
                if repl:
                    # Do not suggest anything for a single letter.
                    if num > 1 or len(words_list[i]) > 1:
                        suggest_local_dict[new_words] = repl

            # repl reflects the second lookup when the first one failed.
            if not repl:
                word_seen.add(words_list[i])
                i += 1
            else:
                i += num

        # Keep only the words the speller rejects, with at most five
        # suggestions each.
        word_seen = [x for x in word_seen if not speller.check(x)]
        speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

        # local_dict is an ordered dict, so the words are emitted in the
        # same order as in local_dict; this gives better wiki diffs when
        # a local dict is updated.
        local_dict_used = [(x, local_dict[x])
                           for x in local_dict if x in used_local_dict]

        # FIXME: for suggest_local_dict, must we remove suggested words
        # from other local dict but working word for the check speller?

        result[variant] = {}
        result[variant]['local_dict_used'] = local_dict_used
        result[variant]['suggest_local_dict'] = suggest_local_dict.items()
        result[variant]['speller_suggest'] = speller_suggest

    return result
def suggest_dict(self, title):
    """Build, per variant, the dictionary suggestions for the page *title*.

    Steps visible in this method: fetch the page HTML, wrap it into a
    complete document, parse it, extract the text without the divs whose
    id is a variant's 'modernize_div_id', then for each variant apply the
    'transform' substitutions, split the text into words and look them up
    with ``self.find_repl``.

    Returns a dict keyed by variant name, see the layout comment below.
    """
    p = self.get_page(title)
    html = self.get_html(p)

    new_html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'

    root = etree.fromstring(new_html.encode('utf-8'))

    # Divs of every variant are excluded from the text extraction.
    exclude = set()

    for variant in self.variants:
        html_id = self.config[variant]['modernize_div_id']

        for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)

    html_text = self.get_etree_text(root, exclude)

    # result = {
    #     'variant_name_1' : {
    #         'local_dict_used' : [(A, B), ... ],
    #         'suggest_local_dict' : { 'C' : 'D' ... },
    #         'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
    #     }
    #     'variant_name_2' : { ... }
    # }
    # Note: 'suggest_local_dict' is stored as the .items() of the dict.
    result = {}

    blacklist = self.load_blacklist()

    for variant in self.variants:
        speller = spell.Speller(self.config[variant]['aspell_lang'])
        cache = self.load_dicts(variant)
        if 'global_dict' in cache:
            global_dict = cache['global_dict'][1]
        else:
            global_dict = self.default_cache()

        # Flatten all cached dicts other than the global one into one
        # mapping; a key present in several dicts keeps the last value.
        other_local_dict = {}
        for key in cache:
            if key != 'global_dict':
                d = cache[key][1]
                for words in d:
                    other_local_dict[words] = d[words]

        # Parsed from the raw page HTML rather than the extracted text.
        local_dict = self.parse_local_dict(variant, html)

        text = html_text
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)

        # Keys of local_dict that were actually used. Only membership is
        # recorded here: the replacement strings are read back from the
        # ordered local_dict later, which preserves its order.
        used_local_dict = set()
        # Entries found in the other local dicts; good suggestions to
        # give to the user.
        suggest_local_dict = {}
        # All remaining words; they are spell checked to provide an
        # additional set of suggestions.
        word_seen = set()

        regex_split = re.compile(u'([' + self.word_chars + u']+)')
        words_list = regex_split.findall(text)
        i = 0
        while True:
            if i >= len(words_list):
                break

            # Blacklisted words are stepped over without any lookup.
            if words_list[i] in blacklist:
                i += 1
                continue

            # find_repl yields (repl, glb, new_words, num); num is the
            # step used below to advance i.
            # NOTE(review): glb presumably flags a match coming from the
            # global dict -- confirm against find_repl.
            repl, glb, new_words, num = self.find_repl(words_list, i, local_dict, global_dict)

            if repl:
                if not glb:
                    used_local_dict.add(new_words)
            else:
                # Not found in the global or local dict: try all the
                # other local dicts to get a suggestion.
                repl, glb, new_words, num = self.find_repl(words_list, i, other_local_dict, {})
                if repl:
                    # Do not suggest anything for a single letter.
                    if num > 1 or len(words_list[i]) > 1:
                        suggest_local_dict[new_words] = repl

            # When the first lookup failed, repl holds the outcome of the
            # second one at this point.
            if not repl:
                word_seen.add(words_list[i])
                i += 1
            else:
                i += num

        # Only words rejected by the speller are kept, each with at most
        # five suggestions.
        word_seen = [x for x in word_seen if not speller.check(x)]
        speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

        # local_dict is an ordered dict, so the words are emitted in the
        # same order as in local_dict; this gives better wiki diffs when
        # a local dict is updated.
        local_dict_used = [ (x, local_dict[x]) for x in local_dict if x in used_local_dict ]

        # FIXME: for suggest_local_dict, must we remove suggested words
        # from other local dict but working word for the check speller?

        result[variant] = {}
        result[variant]['local_dict_used'] = local_dict_used
        result[variant]['suggest_local_dict'] = suggest_local_dict.items()
        result[variant]['speller_suggest'] = speller_suggest

    return result