def test_ec(self):
    logging.info("test_")
    # intentionally misspelled test input; alternative sample: ,你们用的什么婊点符号
    line = '我们现今所使用的大部分舒学符号'
    logging.info('input sentence is: %s', line)
    corrected_sent, correct_ranges = correct(line)
    logging.info('corrected_sent: %s', corrected_sent)
    logging.info('correct_ranges: %s', correct_ranges)
def eval_bcmi_data(data_path, verbose=False):
    # sentence_size starts at 1 to avoid division by zero in the ratio below
    sentence_size = 1
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            error_sentence, right_sentence = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            pred_sentence, pred_detail = corrector.correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence)
                print('right sentence:', right_sentence)
            sentence_size += 1
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = [right_sentence, pred_sentence]
            else:
                wrong_result[error_sentence] = [right_sentence, pred_sentence]
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    return right_count / sentence_size, right_result, wrong_result
def run(image_path):
    image_path_ = image_path.rsplit('.', 1)[0]
    preprocess_image_path = image_path_ + '_bw.jpg'
    audio_path = image_path_ + '.wav'

    # preprocess the scanned page into a binarized image
    image = cv2.imread(image_path)
    image_bw = segmenter.preprocess(image)
    cv2.imwrite(preprocess_image_path, image_bw)

    # segment the page into lines, then characters, and classify each character
    lines = segmenter.segment_lines(image_bw)
    # classified_text = "රජගහා විහාරෆ හඤූතර තරඔක් කදූභැටියරී වර පසින් ඇහ් අ"  # debug override
    classified_text = ""
    for i, line in enumerate(lines):
        character_images = segmenter.segment_line(line, i)
        for character_image in character_images:
            classified_text += classifier.classify(character_image)

    # remove extra spaces
    classified_text = classified_text.strip()
    classified_text = " ".join(classified_text.split())

    # join modifiers
    classified_text = classifier.join_modifiers(classified_text)
    print classified_text

    # spell-correct the classified text
    corrected_words = corrector.correct(classified_text)
    corrected_text = ""
    for words in corrected_words:
        corrected_text += words[0].encode("utf-8") + " "
    print corrected_text

    # synthesize the corrected text and write it out as a WAV file
    synthesized_data = synthesizer.synthesize(corrected_text)
    audio_outfile = wave.open(audio_path, 'wb')
    audio_outfile.setparams(synthesized_data[0][0])
    for i in range(len(synthesized_data)):
        audio_outfile.writeframes(synthesized_data[i][1])
    audio_outfile.close()

    return classified_text, corrected_words, image_path_, audio_path
def ocr(warp_id):
    '''
    Input -
    {
        [crop: {
            [left: int]
            [right: int]
            [top: int]
            [bottom: int]
        }]
    }
    Output - content translated
    '''
    img_path = get_warp_image_path(warp_id)
    img = cv2.imread(img_path, flags=cv2.IMREAD_GRAYSCALE)
    if img is None:
        fl.abort(404)

    json = request.get_json()
    try:
        if "crop" in json:
            crop_obj = json["crop"]
            left = none_or_int(crop_obj.get("left"))
            right = none_or_int(crop_obj.get("right"))
            top = none_or_int(crop_obj.get("top"))
            bottom = none_or_int(crop_obj.get("bottom"))
            img = img[top:bottom, left:right]
    except (KeyError, ValueError):
        fl.abort(400)

    with NamedTemporaryFile(suffix='.ppm') as f:
        cv2.imwrite(f.name, img)
        proc = Popen([os.path.join(CURRENT_DIRECTORY, "nhocr"),
                      f.name, "-o", "-", "-block"],
                     stdout=PIPE)
        buf = u"".join([line.decode("utf-8") for line in proc.stdout])
        proc.wait()

    buf = u"\n".join(buf.splitlines())
    content = correct(buf)
    print content.encode("utf-8")
    annotated, translated = ja_ko_translator(content)
    return fl.jsonify(
        content=annotated,
        translated=translated
    )
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 1
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = corrector.correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # if right_r == pred_r:
            #     right_count += 1
            #     right_result[error_sentence] = [right_r, pred_r]
            # else:
            #     wrong_result[error_sentence] = [right_r, pred_r]
            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    # if verbose:
    #     print('right count:', right_count, ';total count:', total_count)
    return right_count / total_count, right_result, wrong_result
("xsel --version", 'You need to install xsel'), ("xdotool --help", 'You need to install xdotool'), ('xte -help > /dev/null', 'You need to install xautomation'), ("xvkbd -no-sync -no-repeat -xsendevent -text '\C' > /dev/null", 'You need to install xvkbd'), ) try: for command, error_text in commands: ret_code = call(command, shell=True) if ret_code != 0: raise OSError(error_text) except OSError as e: print('\n' + '!' * 30) print("Test failed:\n" + str(e)) print('!' * 30 + '\n') exit(1) if __name__ == '__main__': print('Keyboard layout corrector. \r\nTesting system environment...') check_env() print('Test example:') test_str = 'ghbdtn' corrected_str = correct(test_str) print('"%s" -> "%s"' % (test_str, corrected_str)) if corrected_str == 'привет': print('Test passed.')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from subprocess import call

from corrector import correct

__author__ = 'Odarchenko N.D.'


if __name__ == '__main__':
    print('Keyboard layout corrector. \r\nTesting system environment...')
    commands = (
        ("xsel --clear", 'You need to install xsel'),
        ('setxkbmap > /dev/null', 'You need to install setxkbmap'),
        ("xvkbd -xsendevent -text '\C'>/dev/null", 'You need to install xvkbd'),
    )
    try:
        for command, error_text in commands:
            ret_code = call(command, shell=True)
            if ret_code != 0:
                raise OSError(error_text)
    except OSError as e:
        print('\n' + '!' * 30)
        print("Test failed:\n" + str(e))
        print('!' * 30 + '\n')
        exit(1)

    print('Test example:')
    test = 'ghbdtn'
    print('"%s" -> "%s"' % (test, correct(test)))
    print('Test passed.')
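# Hedged illustration (not part of the original sources above): in this
# keyboard-layout corrector, correct() presumably remaps text typed in the
# wrong layout back to the intended one, e.g. Latin 'ghbdtn' -> Cyrillic
# 'привет'. Below is a minimal sketch of such a remapping, assuming a plain
# QWERTY-to-ЙЦУКЕН character table and Python 3's str.translate; the real
# corrector.correct may work differently. All names here are hypothetical.
_LATIN = "qwertyuiop[]asdfghjkl;'zxcvbnm,."
_CYRILLIC = "йцукенгшщзхъфывапролджэячсмитьбю"
_LATIN_TO_CYRILLIC = str.maketrans(_LATIN, _CYRILLIC)


def correct_sketch(text):
    """Remap characters typed in the Latin (QWERTY) layout to ЙЦУКЕН."""
    return text.translate(_LATIN_TO_CYRILLIC)


# correct_sketch('ghbdtn') == 'привет'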
# -*- coding: utf-8 -*-
import numpy as np
# import segmenter
# import classifier
import corrector
import codecs
# import synthesizer

# read the input text to be corrected
with open('corrector/input.txt', 'r') as input_file:
    input_text = input_file.read()
# input_text = raw_text[3:]
print input_text

# run the corrector and print its output
corrected_text = corrector.correct(input_text)
print '%s' % ''.join([' , '.join('%s' % ' '.join(e) for e in corrected_text)])

# corrected_text = corrector.correct(classified_text)
# print "corrected text : " + corrected_text
# synthesized_voice = synthesizer.synthesize(corrected_text)
            obj = json.loads(resp_payload)
            resultData = obj["resultData"]
            annotated = content
            hurigana = obj.get("hurigana", [])
            for item in hurigana:
                from_ = item["z"]
                to = item["h"]
                annotated = annotated.replace(from_, from_ + "(" + to + ")")
            return (annotated, resultData)

    return trans


if __name__ == '__main__':
    trans = build_trans()
    src = sys.stdin.read().decode("utf-8")
    print "[Original]"
    print src.encode("UTF-8")
    src = correct(src)
    orig, result = trans(src)
    if result is None:
        sys.exit(1)
    result = result.encode("utf-8")
    print "[Translated]"
    sys.stdout.write(orig.encode("utf-8"))
    sys.stdout.write("\n")
    sys.stdout.write(result)
def correct_content(self, content, language):
    # TODO to be moved to LT processes class
    # Segments and sends the content to LT according to the
    # public api rate limits
    # http://wiki.languagetool.org/public-http-api
    if os.path.isfile(self.outpath):
        msg = 'title exists in cache: %s' % self.title
        print(self.outpath)
        print(msg)
        logging.info(msg)
        with open(self.outpath) as f:
            responses = json.load(f)
        return responses
    else:
        responses = {'title': self.title, 'results': []}
        if self.online:
            per_req_size_limit = 6e3  # bytes (~6 KB)
            sentences = content.split('. ')
            requests = []
            test_chunks = []
            chunk = []
            for sentence in sentences:
                chunk.append(sentence)
                total_chunk = '. '.join(chunk)
                if sys.getsizeof(total_chunk) > per_req_size_limit:
                    requests.append(total_chunk)
                    test_chunks.append((chunk[0], chunk[-1]))
                    chunk = []
            if chunk:
                # add last chunk
                requests.append('. '.join(chunk))
                test_chunks.append((chunk[0], chunk[-1]))
            # send requests to api
            # TODO smarter rate limit control needed
            total_requests = len(requests)
            for i, request in enumerate(requests):
                try:
                    response = api.check(request, api_url=self.languagetool,
                                         lang=language)
                    # TODO check language, if confidence lower than 0.90 resend
                except Exception as e:
                    msg = "%s language error. Trying to detect the language." % language
                    logging.warning(msg)
                    response = api.check(test_chunks[i][1], api_url=self.languagetool,
                                         lang=language)
                    language_bottom = response['language']['detectedLanguage']['code']
                    response = api.check(test_chunks[i][0], api_url=self.languagetool,
                                         lang=language_bottom)
                    language_top = response['language']['detectedLanguage']['code']
                    if language != language_top:
                        language = language_top
                    else:
                        language = language_bottom
                    msg = "%s detected as new language" % language
                    logging.info(msg)
                    response = api.check(request, api_url=self.languagetool,
                                         lang=language)
                message = '%i/%i response sent' % (i + 1, total_requests)
                print(message)
                logging.info(message)
                if i + 1 != total_requests:
                    # wait at all except the last LT api call
                    time.sleep(4)
                responses['results'].append({'content': request, 'response': response})
        else:
            chunks = corrector.get_chunks(content)
            corrector.correct(chunks, responses)
        with open(self.outpath, 'w') as out:
            json.dump(responses, out, indent=2)
        return responses
def correcttest(self):
    if not login.current_user.is_authenticated:
        return redirect(url_for('.login_view'))
    found = tests.find()
    if request.method == 'POST':
        title = request.form.get('title')
        tfound = tests.find_one({"TITLE": title})
        # if request does not contain the file part
        if 'file' not in request.files:
            flash('No file was sent', category='danger')
            return redirect(request.url)
        file = request.files['file']
        # if user does not select file, browser will
        # submit an empty part without filename
        if file.filename == '':
            flash('No file was selected', category='danger')
            return redirect(request.url)
        # if file was selected but of the wrong type
        if file and not self.allowed_file(file.filename):
            flash('Please select a .pdf file', category='danger')
            return redirect(request.url)
        # if file was selected & is correct type
        if file and self.allowed_file(file.filename):
            as_jpeg = PDF2jpg.convert(file)
            # fetches the answer key corresponding to the test
            key = self.getAnswerKey(tfound)
            print(key)
            # corrects the test image using the answer key
            # returns (location, correct, amount, score, flag)
            loc, corr, am, sc, flag = corrector.correct(as_jpeg, key)
            curr_time = time.localtime()
            ctime = time.strftime('%a, %d %b %Y %H:%M:%S GMT', curr_time)
            corrected = {
                "TEST": title,
                "SCORE": sc,
                "CORRECT": corr,
                "AMOUNT": am,
                "FLAG": flag,
                "CREATED": ctime
            }
            # insert into the db
            result = results.insert_one(corrected)
            # obtain the MongoDB ObjectID in string form
            id = str(result.inserted_id) + '.png'
            # move and give a unique name to the test image for storage
            # destination = path to file
            destination = shutil.move(loc, 'results/' + id)
            # update the document with test file location
            results.update({'_id': result.inserted_id},
                           {"$set": {"HREF": id}},
                           upsert=False)
            print('NEW_FILE_SAVED={}'.format(destination))
            flash("File was corrected. Visit 'Test Results' to see scores",
                  category='success')
            return render_template('sb-admin/pages/uploadtest.html',
                                   tests=found, admin_view=self)
    self.header = "Correct Test"
    return render_template('sb-admin/pages/uploadtest.html',
                           tests=found, admin_view=self)
def test_correct(self):
    # initialize 'results' so correct() can append to it, mirroring how
    # responses is built in correct_content
    response = {'title': 'test', 'results': []}
    correct(self.test_chunks, response)
    self.assertNotEqual(len(response['results']), 0)