def compare(c1, c2, similarity):
    """Align ``c1`` with ``c2`` and reduce the alignment to a shared context.

    The two inputs are aligned with ``needleman_wunsch.align`` (called with
    ``S=similarity``). Each aligned position is then merged into a two-item
    entry, runs of entries whose second item is ``'*'`` are collapsed, and
    the bracket structure of the result is validated.

    NOTE(review): every aligned item is indexed with ``[0]`` and ``[1]``;
    this presumably means (marker, label) pairs where the marker may be
    ``'['`` or ``']'`` -- confirm against ``needleman_wunsch.align``.

    Returns the collapsed list of entries, or the string ``'NA'`` when the
    alignment is reported as ``'NA'``, the brackets do not line up or nest
    properly, no entry carries the label ``'VB____'``, or more than one
    bracket is left open.
    """
    from needleman_wunsch import align

    brackets = ('[', ']')

    alignment = align(c1, c2, S=similarity)
    if alignment[0][0] == 'NA':
        return 'NA'

    # Pass 1: merge the two aligned rows position by position.
    merged = []
    for position in range(len(alignment[0])):
        left = alignment[0][position]
        left_mark = left[0]
        right = alignment[1][position]
        right_mark = right[0]

        # A bracket on the first row must face the very same bracket.
        if left_mark in brackets and left_mark != right_mark:
            return 'NA'

        same_bracket = left_mark == right_mark and right_mark in brackets
        if same_bracket and left[1] != right[1]:
            # Matching brackets whose labels disagree get the label 'XP'.
            merged.append([left_mark, 'XP'])
        elif left[1] == right[1]:
            # Same label: keep the entry as is when the markers agree too,
            # otherwise wildcard only the marker.
            merged.append(left if left_mark == right_mark else ['*', left[1]])
        else:
            merged.append(['*', '*'])

    # Pass 2: drop an entry whose label is '*' when the previous one is too.
    context = [merged[0]] + [
        current
        for previous, current in zip(merged, merged[1:])
        if current[1] != '*' or previous[1] != '*'
    ]

    # Pass 3: check bracket nesting and look for the 'VB____' label.
    pending = []
    gap_seen = False
    for node in context:
        marker = node[0]
        if marker == '[':
            pending.append(node[1])
        if marker == ']':
            # A closing bracket must match the most recently opened label.
            if pending and pending[-1] == node[1]:
                pending.pop()
            else:
                return 'NA'
        if node[1] == 'VB____':
            gap_seen = True

    # At most one bracket may remain open, and 'VB____' must have been seen.
    if gap_seen and len(pending) <= 1:
        return context
    return 'NA'
def match_kana(kanji, kana, return_score=False):
    """Return the lowest-scoring furigana candidate for ``kanji``/``kana``.

    Candidates come from ``filter_alignments(align(kanji, kana))``; each one
    is turned into a ``(furigana, score)`` pair by ``finalize_furigana``.
    Candidates are consumed lazily and consumption stops right after the
    first pair whose score is ``0``.

    Args:
        kanji: first argument forwarded to ``align``.
        kana: second argument forwarded to ``align``.
        return_score: when true, return ``(best_match, score)`` instead of
            just ``best_match``.

    Raises:
        ValueError: from ``min`` when there are no candidates at all.
    """
    def scored_until_zero(candidates):
        # Yield scored candidates, stopping once a score of 0 was emitted.
        for candidate in candidates:
            furigana, candidate_score = finalize_furigana(
                candidate, return_score=True
            )
            yield furigana, candidate_score
            if candidate_score == 0:
                return

    def by_score(pair):
        return pair[1]

    candidates = filter_alignments(align(kanji, kana))
    best_match, score = min(scored_until_zero(candidates), key=by_score)
    if return_score:
        return (best_match, score)
    return best_match
def approximate_split(readings, kana, max_distance):
    """Split ``kana`` into pieces matching the first reading that fits.

    Args:
        readings: iterable of ``(result, score)`` pairs, where ``result`` is
            a sized sequence of two-item pairs. Iteration stops at the first
            score above ``max_distance`` (presumably the readings are sorted
            by ascending score -- confirm with the caller).
        kana: string aligned against the space-joined second items of
            ``result``.
        max_distance: highest score still considered.

    Returns:
        A list pairing the first item of every ``result`` entry with the
        matching piece of the aligned ``kana``, for the first reading whose
        number of non-empty pieces equals ``len(result)``; ``None`` when no
        reading qualifies.
    """
    def similarity(a, b):
        # Equal symbols score 1, anything paired with a space scores -10,
        # every other mismatch scores -1.
        if a == b:
            return 1
        return -10 if (a == ' ' or b == ' ') else -1

    for result, score in readings:
        if score > max_distance:
            break

        spaced_reading = ' '.join([reading for _, reading in result])
        # Only the first alignment produced by ``align`` is used.
        aligned_kana, _aligned_reading = next(
            align(kana, spaced_reading, d=-1, fill=" ", s=similarity)
        )

        # Spaces in the aligned kana mark the piece boundaries; empty pieces
        # are discarded.
        pieces = [piece for piece in aligned_kana.split(' ') if piece]
        surfaces = [surface for surface, _ in result]
        paired = list(zip(surfaces, pieces))
        if len(paired) == len(result):
            return paired

    return None