Ejemplo n.º 1
0
from root_based_finder import is_non_std, word_parts
from segment import Segmenter, combine_many_boxes
import shelve
import signal
import simplejson as json
from sklearn.externals import joblib
import sys
from termset import syllables

from tparser import parse_syllables
from utils import local_file
from viterbi_cython import viterbi_cython
# from viterbi_search import viterbi_search, word_bigram
import warnings

# Load the classifier identified by 'logistic-cls' once, at import time, and
# keep it as a module-level global.
# NOTE(review): load_cls is not imported in the visible part of this file;
# presumably it is defined earlier in this module -- confirm.
cls = load_cls('logistic-cls')

## Ignore warnings. This is mostly in response to incessant sklearn
## warnings about passing in 1d arrays.
## The "ignore" filter is given no category or module, so it applies to every
## warning raised after this point, not only the sklearn ones.
warnings.filterwarnings("ignore")
# Announce on stdout that warnings are now being suppressed.
print 'ignoring all warnings'
###

# Second classifier, loaded the same way under the name 'rbf-cls'.
rbfcls = load_cls('rbf-cls')
# Module-level shortcuts to the bound prediction methods of `cls` (the
# 'logistic-cls' classifier, not `rbfcls`).
predict_log_proba = cls.predict_log_proba
predict_proba = cls.predict_proba

# Trained characters are labeled by number. Open the shelve that contains
# the mappings between the Unicode character and its number label.
# The shelve handle is kept in `allchars` and is not closed in the code
# visible here.
allchars = shelve.open(local_file('allchars_dict2'))
# The mapping itself is stored under the 'allchars' key of the shelve.
char_to_dig = allchars['allchars']
Ejemplo n.º 2
0
# encoding: utf-8
'''Line breaking'''
from numpy import  array, float64, argmax, argmin, uint8, ones, floor, mean, std, where, argsort
import cv2 as cv
from utils import  check_for_overlap
from fast_utils import ftrim, fadd_padding
import sys
from bisect import bisect, bisect_right
from feature_extraction import normalize_and_extract_features
from classify import load_cls, label_chars

# Load the classifier identified by 'logistic-cls' via classify.load_cls.
# This runs once, when the module is imported, and the result is kept as a
# module-level global.
cls = load_cls('logistic-cls')

class LineCut(object):
    '''Line Cutting object - breaks lines in a page where lines are separated
    by empty whitespace
    
    Parameters:
    --------------------
    shapes: page_element object, (see page_elements.py)
    
    thresh_scale: float, default=.9995
        A threshold value for determining the breakline in the event that
        there is black pixel noise between lines. Should be set high to avoid
        setting line breaks through characters themselves. 
    
    Attributes:
    -----------
    lines_chars: list of lists, length=number of lines on page. Each sub-list
        contains the indices for the bounding boxes/contours assigned to
        its corresponding line.