def get_words(letters):
    """Group letter candidates into words (chains of similar, nearby letters).

    Args:
        letters: 2-D numpy array with one row per letter candidate. The
            columns read here are: 0 = median stroke width, 1 = height,
            2 = width, 3:5 = 2-D position of the letter.

    Returns:
        1-D numpy array of dtype ``object`` whose elements are sets of row
        indices into ``letters``. Only chains with more than 3 letters are
        kept, each distinct chain appears once, and chains are ordered by
        their sorted member indices. The array is empty when no chain
        qualifies (including when ``letters`` has fewer than two rows).
    """
    max_width_factor = 3             # neighbours lie within 3x the wider letter's width
    max_angle_diff = math.pi / 12    # tolerance for "same text direction"
    min_letters_exclusive = 3        # a word needs more than this many letters

    no_words = np.empty(0, dtype=object)
    if len(letters) < 2:
        # Not a single pair can be formed; avoid building trees on no data.
        return no_words

    # Index-pairs of letters with similar median stroke widths and similar
    # heights. log2 turns the ratio test into a linear distance for KDTree
    # (i.e. if log2(x) - log2(y) > 1, we know that x > 2*y).
    s_ix_letr_pairs = KDTree(np.log2(letters[:, 0:1])).query_pairs(1)
    h_ix_letr_pairs = KDTree(np.log2(letters[:, 1:2])).query_pairs(1)

    # Calc the angle (direction of text) for all letter-pairs which are
    # similar and close to each other.
    pair_letters = []
    pair_angles = []
    for ix_letr1, ix_letr2 in h_ix_letr_pairs.intersection(s_ix_letr_pairs):
        diff = letters[ix_letr1, 3:5] - letters[ix_letr2, 3:5]
        dist = np.linalg.norm(diff)
        widest = max(letters[ix_letr1, 2], letters[ix_letr2, 2])
        if dist < widest * max_width_factor:
            angle = math.atan2(diff[0], diff[1])
            if angle < 0:
                # Fold opposite directions onto each other: range [0, pi].
                # NOTE(review): angles close to 0 and close to pi describe
                # nearly the same direction but are not treated as similar
                # below - confirm whether that is intended.
                angle += math.pi
            pair_letters.append((int(ix_letr1), int(ix_letr2)))
            pair_angles.append(angle)

    if len(pair_letters) < 2:
        # At least two letter-pairs are needed to find pairs with a similar
        # angle; previously this case raised instead of returning nothing.
        return no_words

    # Pairs of letter-pairs with a similar angle (direction of text).
    angle_tree = KDTree(np.asarray(pair_angles).reshape(-1, 1))
    a_ix_pair_pairs = angle_tree.query_pairs(max_angle_diff)

    # Grow chains: each letter-pair extends every chain that already holds
    # one of its letters, otherwise it starts a new chain.
    # NOTE(review): chains that become connected later are not merged, so a
    # letter may end up in several overlapping chains.
    chains = []
    for ix_pair_a, ix_pair_b in a_ix_pair_pairs:
        for letr1, letr2 in (pair_letters[ix_pair_a], pair_letters[ix_pair_b]):
            added = False
            for chain in chains:
                if letr1 in chain:
                    chain.add(letr2)
                    added = True
                elif letr2 in chain:
                    chain.add(letr1)
                    added = True
            if not added:
                chains.append({letr1, letr2})

    # Keep sufficiently long chains and drop duplicates. Membership (not the
    # string representation, which depends on set iteration order) is the
    # de-duplication key.
    unique_chains = {}
    for chain in chains:
        if len(chain) > min_letters_exclusive:
            unique_chains.setdefault(tuple(sorted(chain)), chain)

    words = np.empty(len(unique_chains), dtype=object)
    for pos, key in enumerate(sorted(unique_chains)):
        words[pos] = unique_chains[key]
    return words
def _find_words(letr_inf):
    """Find words as chains of similar letters lying in a similar direction.

    Args:
        letr_inf: 2-D numpy array, one row per letter candidate. Columns
            used: 0 = median stroke width (swt), 1 = height, 2 = width,
            3:5 = top-left point of the letter.

    Returns:
        1-D ``object`` numpy array of sets; every set holds the row indices
        (into ``letr_inf``) of the letters of one chain. Chains must contain
        more than 3 letters, are de-duplicated by membership and are ordered
        by their sorted indices. An empty array is returned when nothing
        qualifies, e.g. for fewer than two letters or fewer than two
        matching letter-pairs.
    """
    empty_result = np.empty(0, dtype=object)
    if len(letr_inf) < 2:
        # No letter pair is possible.
        return empty_result

    # Index-pairs of letters with similar median stroke widths and similar
    # heights. We use log2 for linear distance comparison in KDTree
    # (i.e. if log2(x) - log2(y) > 1, we know that x > 2*y).
    similar_swt = KDTree(np.log2(letr_inf[:, 0:1])).query_pairs(1)
    similar_height = KDTree(np.log2(letr_inf[:, 1:2])).query_pairs(1)

    # Collect the angle (direction of text) of all letter-pairs which are
    # similar and close to each other.
    candidates = []  # (letter index 1, letter index 2, angle)
    for ix_letr1, ix_letr2 in similar_height.intersection(similar_swt):
        diff = letr_inf[ix_letr1, 3:5] - letr_inf[ix_letr2, 3:5]
        # Distance between letters must be smaller than 3 times the width
        # of the wider letter.
        limit = max(letr_inf[ix_letr1, 2], letr_inf[ix_letr2, 2]) * 3
        if np.linalg.norm(diff) >= limit:
            continue
        angle = math.atan2(diff[0], diff[1])
        # Map opposite directions to the same value (range [0, pi]).
        angle += math.pi if angle < 0 else 0
        candidates.append((int(ix_letr1), int(ix_letr2), angle))

    if len(candidates) < 2:
        # Without two letter-pairs there are no pairs of pairs to compare;
        # the array-based code used to raise here.
        return empty_result

    # Pairs of letter-pairs with a similar angle (direction of text).
    angles = np.asarray([cand[2] for cand in candidates]).reshape(-1, 1)
    a_ix_pair_pairs = KDTree(angles).query_pairs(math.pi / 12)

    chains = []
    for ix_pair_a, ix_pair_b in a_ix_pair_pairs:
        # Letter pairs [a] & [b] have a similar angle and each pair consists
        # of letter [1] & [2] which meet the similarity-requirements.
        # TODO: not correct? Pairs [a] and [b] are added independently of
        # each other, and chains that get connected later are never merged.
        for ix_pair in (ix_pair_a, ix_pair_b):
            letr1, letr2, _ = candidates[ix_pair]
            added = False
            for chain in chains:
                if letr1 in chain:
                    chain.add(letr2)
                    added = True
                elif letr2 in chain:
                    chain.add(letr1)
                    added = True
            if not added:
                chains.append({letr1, letr2})

    # Keep chains with more than 3 letters, one per distinct membership.
    # Comparing sorted members avoids relying on the textual representation
    # of a set, which depends on its iteration order.
    by_members = {}
    for chain in chains:
        if len(chain) > 3:
            by_members.setdefault(tuple(sorted(chain)), chain)

    result = np.empty(len(by_members), dtype=object)
    for pos, members in enumerate(sorted(by_members)):
        result[pos] = by_members[members]
    return result