Exemple #1
0
def fvariant(fname,kind,gt=""):
    """Build the file name of the `kind` variant belonging to `fname`.

    `fname` is split with allsplitext() (presumably stripping every
    extension -- confirm against that helper) and the resulting base is
    combined with the suffix for the requested kind:

        txt                -> base[.gt].txt
        line / png / bin   -> base.bin.png
        nrm                -> base.nrm.png
        lattice            -> base.lattice
        rseg               -> base.rseg.png
        cseg               -> base.cseg.png
        aligned            -> base.aligned
        costs              -> base.costs

    A non-empty `gt` (e.g. "gt") inserts ".gt" before the extension and is
    only accepted for kind "txt"; for any other kind an assertion fails.
    Raises BadInput when `kind` is not one of the names above."""
    marker = "."+gt if gt!="" else gt
    stem,_ = allsplitext(fname)
    # The text variant is the only one that may carry a ground truth marker,
    # so it is resolved before the marker is checked.
    if kind=="txt":
        return stem+marker+".txt"
    assert marker=="","gt suffix may only be supplied for .txt files (%s,%s,%s)"%(fname,kind,marker)
    # (accepted kind names, suffix appended to the stem)
    variants = (
        (("line","png","bin"), ".bin.png"),   # text line image
        (("nrm",), ".nrm.png"),
        # marker is empty here whenever assertions are enabled
        (("lattice",), marker+".lattice"),    # recognition lattice
        (("rseg",), ".rseg.png"),             # raw segmentation
        (("cseg",), ".cseg.png"),             # character segmentation
        (("aligned",), ".aligned"),           # text aligned with the cseg
        (("costs",), ".costs"),               # per character costs
    )
    for names,ending in variants:
        if kind in names:
            return stem+ending
    raise BadInput("unknown kind: %s"%kind)
Exemple #2
0
def gt_implode(l):
    """Join a sequence of transcription items into one ground truth string.

    Each item of `l` is encoded as follows:

    - the item "_" becomes "___";
    - any other item of length 0 or 1 is copied unchanged;
    - an item of length 2 to 4 is wrapped in underscores ("ab" -> "_ab_").

    Returns the concatenation of the encoded items.
    Raises BadInput if an item is longer than four characters."""
    result = []
    for c in l:
        if c=="_":
            result.append("___")
        elif len(c)<=1:
            result.append(c)
        elif len(c)<=4:
            result.append("_"+c+"_")
        else:
            # Wrap `l` in a one-element tuple: with a bare `% l`, a tuple
            # argument would be unpacked as the format arguments and raise
            # TypeError (or print only its single element) instead of
            # reporting the whole input.
            raise BadInput("cannot create ground truth transcription for: %s"%(l,))
    return "".join(result)
Exemple #3
0
def project_text(s, kind="exact"):
    """Project text onto a smaller subset of characters for comparison.

    The text is first passed through normalize_text(); then rows of four or
    more dots are collapsed to "...." and every "~" and "_" is removed.
    `kind` selects what is kept afterwards:

        exact     -- everything
        nospace   -- everything except whitespace
        spletdig  -- ASCII letters, digits and the space character
        letdig    -- ASCII letters and digits
        letters   -- ASCII letters
        digits    -- ASCII digits
        lnc       -- ASCII letters, after upper-casing the text

    Raises BadInput for any other `kind`."""
    # NOTE: the patterns are plain u'' literals rather than ur'' ones; the
    # values are identical, but the ur prefix is a syntax error on Python 3.
    s = normalize_text(s)
    # dot rows: four or more dots, each optionally padded with spaces
    s = re.sub(u'( *[.] *){4,}', u'....', s)
    # drop tilde and underscore characters
    s = re.sub(u'[~_]', u'', s)
    if kind == "exact":
        return s
    if kind == "nospace":
        return re.sub(u'\\s', '', s)
    if kind == "spletdig":
        return re.sub(u'[^A-Za-z0-9 ]', '', s)
    if kind == "letdig":
        return re.sub(u'[^A-Za-z0-9]', '', s)
    if kind == "letters":
        return re.sub(u'[^A-Za-z]', '', s)
    if kind == "digits":
        return re.sub(u'[^0-9]', '', s)
    if kind == "lnc":
        # upper-case first so that lower-case letters survive the filter
        s = s.upper()
        return re.sub(u'[^A-Z]', '', s)
    raise BadInput("unknown normalization: " + kind)