Esempio n. 1
0
def get_words(letters):
    """Group letters into candidate words (chains of similar, nearby letters).

    Two letters form a pair when their median stroke widths (column 0) and
    their heights (column 1) each differ by at most a factor of two, and the
    distance between their positions (columns 3:5) is below three times the
    larger of their widths (column 2).  A pair is added to the chains only if
    at least one other pair has a similar text direction (within pi / 12).

    Args:
        letters: 2-D NumPy array with one row per letter.  Columns 0 and 1
            are passed to ``np.log2``, so they are assumed to be strictly
            positive.

    Returns:
        A 1-D NumPy object array whose elements are sets of row indices into
        ``letters``.  Only chains with more than three letters are kept and
        chains with identical members are reported once.  The array is empty
        when no chain qualifies (including when there are too few letters or
        pairs to form one).
    """
    no_words = np.empty(0, dtype=object)
    if len(letters) < 2:
        # A pair needs two letters; avoid indexing/KDTree on degenerate input.
        return no_words

    # Index-pairs of letters with similar stroke widths and similar heights.
    # log2 turns "within a factor of two" into "within a distance of one".
    s_ix_letr_pairs = KDTree(np.log2(letters[:, 0:1])).query_pairs(1)
    h_ix_letr_pairs = KDTree(np.log2(letters[:, 1:2])).query_pairs(1)

    # Direction of text for every pair of similar letters that are also close
    # to each other.
    letter_pairs = []
    angles = []
    for ix_letr1, ix_letr2 in h_ix_letr_pairs.intersection(s_ix_letr_pairs):
        diff = letters[ix_letr1, 3:5] - letters[ix_letr2, 3:5]
        dist = np.linalg.norm(diff)
        # Distance must be below 3 times the width of the wider letter.
        if dist < max(letters[ix_letr1, 2], letters[ix_letr2, 2]) * 3:
            angle = math.atan2(diff[0], diff[1])
            if angle < 0:
                # Fold into [0, pi] so a pair's direction ignores its order.
                # NOTE(review): 0 and pi describe the same orientation but
                # are far apart in the KD-tree below -- confirm intended.
                angle += math.pi
            letter_pairs.append((int(ix_letr1), int(ix_letr2)))
            angles.append(angle)

    if len(letter_pairs) < 2:
        # No two pairs can share a direction, so no chain can be formed.
        # (Previously this raised IndexError / ValueError.)
        return no_words

    chains = []

    def add_pair(letr1, letr2):
        # Extend every chain that already holds one of the two letters;
        # start a new chain if none does.  Chains are never merged.
        added = False
        for chain in chains:
            if letr1 in chain:
                chain.add(letr2)
                added = True
            elif letr2 in chain:
                chain.add(letr1)
                added = True
        if not added:
            chains.append({letr1, letr2})

    # Pairs of letter-pairs with a similar angle (direction of text).
    angle_column = np.asarray(angles).reshape(-1, 1)
    for ix_pair_a, ix_pair_b in KDTree(angle_column).query_pairs(math.pi / 12):
        add_pair(*letter_pairs[ix_pair_a])
        add_pair(*letter_pairs[ix_pair_b])

    # Keep long chains only and drop duplicates.  The key is built from the
    # sorted members because str(set) depends on insertion history and can
    # differ between equal sets.
    unique_chains = {}
    for chain in chains:
        if len(chain) > 3:
            unique_chains.setdefault(str(sorted(chain)), chain)

    ordered_keys = sorted(unique_chains)
    words = np.empty(len(ordered_keys), dtype=object)
    for ix, key in enumerate(ordered_keys):
        words[ix] = unique_chains[key]
    return words
Esempio n. 2
0
def _find_words(letr_inf): # swts, heights, widths, topleft_pts, images):
    # Index-pairs of letter with similar median stroke widths and similar heights
    # We use log2 for linear distance comparison in KDTree
    # (i.e. if log2(x) - log2(y) > 1, we know that x > 2*y)
    s_ix_letr_pairs = KDTree(np.log2(letr_inf[:, 0:1])).query_pairs(1)
    h_ix_letr_pairs = KDTree(np.log2(letr_inf[:, 1:2])).query_pairs(1)

    # Calc the angle (direction of text) for all letter-pairs which are
    # similar and close to each other
    pairs = []
    for ix_letr1, ix_letr2 in h_ix_letr_pairs.intersection(s_ix_letr_pairs):
        diff = letr_inf[ix_letr1, 3:5] - letr_inf[ix_letr2, 3:5]
        # Distance between letters smaller than
        # 3 times the width of the wider letter
        dist = np.linalg.norm(diff)
        if dist < max(letr_inf[ix_letr1, 2], letr_inf[ix_letr2, 2]) * 3:
            angle = math.atan2(diff[0], diff[1])
            angle += math.pi if angle < 0 else 0
            pairs.append([ix_letr1, ix_letr2, angle])
    pairs = np.asarray(pairs)

    # Pairs of letter-pairs with a similar angle (direction of text)
    a_ix_pair_pairs = KDTree(pairs[:, 2:3]).query_pairs(math.pi / 12)

    chains = []
    for ix_pair_a, ix_pair_b in a_ix_pair_pairs:
        # Letter pairs [a] & [b] have a similar angle and each pair consists of
        # letter [1] & [2] which meet the similarity-requirements.
        pair_a_letr1, pair_a_letr2 = int(pairs[ix_pair_a, 0]), int(pairs[ix_pair_a, 1])
        pair_b_letr1, pair_b_letr2 = int(pairs[ix_pair_b, 0]), int(pairs[ix_pair_b, 1])

        # TODO: not correct?
        added = False
        for c in chains:
            if pair_a_letr1 in c:
                c.add(pair_a_letr2)
                added = True
            elif pair_a_letr2 in c:
                c.add(pair_a_letr1)
                added = True
        if not added:
            chains.append({pair_a_letr1, pair_a_letr2})
        added = False
        for c in chains:
            if pair_b_letr1 in c:
                c.add(pair_b_letr2)
                added = True
            elif pair_b_letr2 in c:
                c.add(pair_b_letr1)
                added = True
        if not added:
            chains.append({pair_b_letr1, pair_b_letr2})
    chains = np.asarray(chains)

    # List of sets of letters with possibly many duplicates
    # return chains
    # Single list of unique letters
    # return np.unique([int(ix) for chain in chains if len(chain) >= 3 for ix in chain])

    vecfunc = np.vectorize(len)
    chains = chains[vecfunc(chains) > 3]
    _, uniq_ix = np.unique(chains.astype(str), return_index=True)
    return chains[uniq_ix]