Ejemplo n.º 1
0
def compare(c1, c2, similarity):
    """Merge the alignment of two token sequences into one shared context.

    ``c1`` and ``c2`` are aligned with ``needleman_wunsch.align``, passing
    ``similarity`` as its ``S`` argument.  Every aligned item is indexed as a
    pair: item[0] is compared against the bracket markers ``'['`` / ``']'``
    and item[1] is compared as a tag.
    NOTE(review): the exact item layout comes from the aligner -- confirm.

    Returns the merged list of pairs, or the string ``'NA'`` when:
      * the first aligned item of the first sequence equals ``'NA'``;
      * a bracket in the first sequence faces a different marker;
      * a closing bracket has no matching open bracket with the same tag;
      * no item carries the tag ``'VB____'``;
      * more than one bracket is still open at the end.
    """
    from needleman_wunsch import align

    alignment = align(c1, c2, S=similarity)
    if alignment[0][0] == 'NA':
        return 'NA'

    brackets = ['[', ']']
    first, second = alignment[0], alignment[1]

    # Step 1: merge the two aligned rows position by position.
    merged = []
    for idx, left in enumerate(first):
        right = second[idx]
        head_is_bracket = left[0] in brackets
        heads_match = left[0] == right[0]
        if head_is_bracket and not heads_match:
            return 'NA'

        tags_match = left[1] == right[1]
        if heads_match and head_is_bracket and not tags_match:
            # Same bracket on both sides but different tags: generic 'XP'.
            merged.append([left[0], 'XP'])
        elif not tags_match:
            merged.append(['*', '*'])
        elif heads_match:
            merged.append(left)
        else:
            # Tags agree but the first elements differ: keep only the tag.
            merged.append(['*', left[1]])

    # Step 2: drop an item when both it and its predecessor have a '*' tag,
    # so a run of wildcard-tagged items shrinks to its first member.
    context = [merged[0]]
    for previous, current in zip(merged, merged[1:]):
        if not (current[1] == '*' and previous[1] == '*'):
            context.append(current)

    # Step 3: check bracket nesting and look for the 'VB____' tag.
    stack = []
    has_gap = False
    for word in context:
        marker, tag = word[0], word[1]
        if marker == '[':
            stack.append(tag)
        elif marker == ']':
            if not stack or stack[-1] != tag:
                return 'NA'
            stack.pop()
        if tag == 'VB____':
            has_gap = True

    # A single bracket left open is tolerated; two or more are not.
    if has_gap and len(stack) <= 1:
        return context
    return 'NA'
Ejemplo n.º 2
0
def match_kana(kanji, kana, return_score=False):
    """Return the lowest-scoring furigana for ``kanji`` read as ``kana``.

    Candidates come from ``filter_alignments(align(kanji, kana))``; each is
    turned into a ``(furigana, score)`` pair by ``finalize_furigana``.  The
    pair with the smallest score wins, and the first one wins on ties.
    Evaluation stops once a candidate scores exactly 0.

    Args:
        kanji: forwarded to ``align`` as its first argument.
        kana: forwarded to ``align`` as its second argument.
        return_score: when true, return ``(furigana, score)`` instead of
            just the furigana.

    Raises:
        ValueError: from ``min`` when there are no candidates at all.
    """
    def scored_until_perfect(candidates):
        # Lazily score candidates; nothing after a zero score is evaluated.
        for candidate in candidates:
            furigana, candidate_score = finalize_furigana(candidate,
                                                          return_score=True)
            yield furigana, candidate_score
            if candidate_score == 0:
                return

    candidates = filter_alignments(align(kanji, kana))
    best_match, score = min(scored_until_perfect(candidates),
                            key=lambda pair: pair[1])
    if return_score:
        return (best_match, score)
    return best_match
Ejemplo n.º 3
0
def approximate_split(readings, kana, max_distance):
    """Split ``kana`` across the parts of the first reading that fits.

    ``readings`` is iterated as ``(result, score)`` pairs, where ``result``
    is a sequence of two-element items.  Iteration stops at the first pair
    whose score exceeds ``max_distance``.
    NOTE(review): the early stop presumably relies on ascending scores.

    For each candidate, ``kana`` is aligned against the space-joined second
    elements of ``result``.  The aligned kana is then cut at spaces, empty
    pieces are dropped, and the pieces are zipped with the first elements of
    ``result``.  The first zipped list as long as ``result`` is returned.

    Returns:
        A list of ``(first_element, kana_piece)`` tuples, or ``None`` when no
        candidate produces a split of the right length.
    """
    def char_score(a, b):
        # Equal characters score 1 (two spaces included); otherwise a pair
        # involving a space scores -10 and any other mismatch scores -1.
        if a == b:
            return 1
        return -10 if (a == ' ' or b == ' ') else -1

    for result, score in readings:
        if score > max_distance:
            break

        joined_reading = ' '.join([reading for _, reading in result])

        # Only the first alignment produced by ``align`` is used.
        aligned_kana, _ = next(
            align(kana, joined_reading, d=-1, fill=" ", s=char_score)
        )

        pieces = [piece for piece in aligned_kana.split(' ') if piece]
        pairs = list(zip([base for base, _ in result], pieces))
        if len(pairs) == len(result):
            return pairs

    return None