def translate(text):
    """Translate text by splitting it into words and looking each one up.

    The text is first segmented with find_words, and the resulting words
    are passed to check_words, whose result is returned unchanged. Each
    stage is reported through time_elapsed for performance tracking.
    """
    checkpoint = dt.datetime.now()

    segmented = find_words(text)
    # time_elapsed hands back the starting point for the next measurement.
    checkpoint = time_elapsed("Find words", checkpoint)

    translated = check_words(segmented)
    time_elapsed("Check words", checkpoint)

    return translated
def search(text):
    """Look up a query string and return the outcome serialized as JSON.

    The query is handed to search_dish_name, and whatever it returns is
    serialized with json.dumps.
    """
    # Timing information; can be deleted later.
    began = datetime.datetime.now()

    # Fetch the search data for this query.
    dish_info = search_dish_name(text)
    time_elapsed("Search and translate", began)

    return json.dumps(dish_info)
def preprocess_image(path):
    """Run the standard pre-Tesseract preprocessing on the image at path.

    The image is opened, passed through smooth_and_grayscale and then
    binarize, and the result is saved back to the same path, replacing
    the original file. Each stage is reported through time_elapsed.
    """
    checkpoint = dt.datetime.now()

    # Steps using PIL
    smoothed = smooth_and_grayscale(Image.open(path))
    checkpoint = time_elapsed("Smoothing", checkpoint)

    binarized = binarize(smoothed)
    time_elapsed("Binarization", checkpoint)

    # Overwrite the input file with the processed image.
    binarized.save(path)
def upload():
    """Handle an image upload: save it, preprocess it and run Tesseract on it.

    The raw request body is written to a timestamped PNG file inside the
    configured UPLOAD_FOLDER, preprocessed and passed to Tesseract. If no
    text comes back, a second preprocessing pass (smooth_and_thin_image)
    is applied to the same file and Tesseract is run once more.

    Returns:
        A redirect to the ``search`` endpoint carrying the recognised text,
        or a JSON string with an ``error`` key when the request has no body
        or no text could be extracted.
    """
    image_data = request.data
    if not image_data:
        # Previously this case fell through and returned None instead of a
        # response; report it to the client like the other failure path.
        return json.dumps({"error": "No image data received. Please try again."})

    # Create a unique filename for the image. The hour and microseconds are
    # part of the stamp so uploads made at the same minute/second of
    # different hours no longer overwrite each other.
    now = datetime.datetime.utcnow()
    filename = now.strftime('%Y%m%d%H%M%S%f') + '.png'
    image_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

    # timing information for performance, can delete later
    # (started before the write so "Writing file" measures the write itself)
    start = datetime.datetime.now()

    # write the image data to the file
    with open(image_path, 'wb') as f:
        f.write(image_data)
    start = time_elapsed("Writing file", start)

    # do some preprocessing on the image to optimize it for Tesseract
    preprocess_image(image_path)
    start = time_elapsed("Preprocessing", start)

    # run the image through tesseract and extract text
    # NOTE(review): this pass hard-codes "chi_sim" while the retry below uses
    # LANG -- confirm both are meant to be the same language.
    text = image_file_to_string(image_path, lang="chi_sim", graceful_errors=True)
    text = text.strip()
    start = time_elapsed("Tesseract", start)

    if not text:
        # if Tesseract returns nothing, do some additional processing to see
        # if results improve, then run through tesseract again
        smooth_and_thin_image(image_path)
        text = image_file_to_string(image_path, lang=LANG, graceful_errors=True)
        text = text.strip()
        time_elapsed("Tesseract", start)

    if not text:
        # if still no text from Tesseract, send an error to the client
        error_data = {"error": "No results found. Please try again."}
        return json.dumps(error_data)

    # if text was received, redirect to search
    return redirect(url_for("search", text=text))
def search_dish_name(text):
    """Search the dishes database for text, falling back to translation.

    Input that is not already unicode is decoded as UTF-8 first. If an
    exact match is found its data is returned; otherwise the text is
    translated and, for inputs longer than one character, similar dishes
    are looked up as well.

    Returns:
        None when the text is longer than 10 characters. Otherwise either
        the value of ``match.get_json()`` for a matching dish, or a dict
        with a ``'translation'`` key and, when similar dishes were found,
        a ``'similar'`` key holding their ``get_json_min()`` data.
    """
    # timing information, can delete later.
    start = dt.datetime.now()

    # isinstance (rather than an exact type comparison) so unicode
    # subclasses are not decoded a second time.
    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    if len(text) > 10:
        # Most dish names are 3-5 characters.
        # If Tesseract returned more than 10 characters, something probably
        # went wrong.
        print("Input text is too long.")
        return None

    # Find a matching dish, if it exists.
    match = Dish.find_match(text)
    if match:
        # If result is found, return the dish's own representation.
        results = match.get_json()
        time_elapsed("Dish lookup", start)
        return results

    # If no dish is found, return translation data and similar dishes,
    # if they exist.
    translation = translate(text)
    start = time_elapsed("Translation", start)
    results = {'translation': translation}

    # Find similar dishes and add to results.
    if len(text) > 1:
        similar_dishes = Dish.find_similar(text)
        time_elapsed("Similar dish lookup", start)
        similar_json = [dish.get_json_min() for dish in similar_dishes]
        if similar_json:
            results['similar'] = similar_json

    return results
def smooth_and_thin_image(path):
    """Apply the extra preprocessing pass to the image at path.

    The image is converted with im_to_trutharray, run through
    smooth_and_emphasize_angles and thin_image, converted back with
    trutharray_to_im and saved over the original file. Each stage is
    reported through time_elapsed.
    """
    checkpoint = dt.datetime.now()

    # Steps using PIL, then conversion for the scikit-image steps
    pix = im_to_trutharray(Image.open(path))

    # Stentiford preprocessing for image thinning. The call's return value
    # (if any) is not used; pix itself is passed on to the thinning step.
    smooth_and_emphasize_angles(pix)
    checkpoint = time_elapsed("Stentiford preprocessing", checkpoint)

    # Thinning
    thinned = thin_image(pix)
    time_elapsed("Thinning", checkpoint)

    # Overwrite the input file with the processed image.
    trutharray_to_im(thinned).save(path)