Example #1
0
    def detect(and_or_word):
        """Fold repeated occurrences of `and_or_word` in a group into one AndOr.

        For every group in the enclosing scope's `groups` in which
        `word_indices_in_group` reports at least two occurrences of
        `and_or_word`, an `AndOr(and_or_word)` is built:

        * the words between each pair of consecutive occurrences are passed
          to `cl.add` as one slice;
        * the words after the last occurrence are collected one by one up to
          the end of the group, stopping early (only while `cl.pos` is
          'noun') at the first word that cannot agree in case with what has
          been added so far.

        Side effects on enclosing-scope state: every index from the first
        occurrence up to the end of the consumed span is added to `visited`,
        `words[first occurrence]` is replaced by the AndOr, and that index is
        added to `ao_loc`.  Nothing is returned.

        NOTE(review): presumably this models correlative constructions such
        as "et ... et ..."; confirm against the caller.
        """
        # Computed for all groups up front, before `words` is mutated below.
        ao_indices = [word_indices_in_group(and_or_word, group) for group in groups]
        for i, aos in enumerate(ao_indices):
            num_of_ao = len(aos)
            if num_of_ao < 2:
                continue

            cl = AndOr(and_or_word)

            # First add the members whose span is already fixed, i.e. every
            # one except the last: each runs from just after one conjunction
            # to just before the next.
            for j in range(num_of_ao - 1):
                this_et_idx = aos[j]
                next_et_idx = aos[j + 1]
                cl.add(words[this_et_idx + 1:next_et_idx])

            # The last member has no closing conjunction, so add it while
            # checking word by word where it ends.
            last_et_idx = aos[-1]
            # assumes groups[i] lists word indices with the last one being the
            # group's final word -- TODO confirm
            end_idx = groups[i][-1] + 1

            # Exclusive end of the span consumed by this construct.  It starts
            # just past the last conjunction so that it is well defined even
            # when no word follows it (previously the loop variable was read
            # after the loop and could be unbound or stale in that case).
            consumed_end = last_et_idx + 1
            ws = []
            for idx in range(last_et_idx + 1, end_idx):
                word = words[idx]
                if cl.pos == 'noun':
                    # Only looked up when actually needed for the case check.
                    first_item = word.items[0]
                    # Restrict to words that carry case information.
                    if first_item._ is None:
                        break
                    if non_genitive(first_item._):
                        cases = [x[0] for x in first_item._]
                        # Stop if there is no possibility that this word
                        # agrees in case with the members collected so far.
                        if not any(case in cases for case in cl.cases):
                            break
                ws.append(word)
                consumed_end = idx + 1

            if ws:
                cl.add(ws)
            # Otherwise nothing could be attached after the last conjunction
            # (the original author flagged this as "ERROR: type mismatch");
            # it is deliberately left unhandled.

            for j in range(aos[0], consumed_end):
                visited.add(j)

            cl.restrict()

            words[aos[0]] = cl
            ao_loc.add(aos[0])
Example #2
0
def detect_genitive_correspondances(words):
    """Attach each genitive word to a neighbouring target word.

    The words are first classified by index:

    * `Predicate` instances are ignored.
    * `AndOr` instances are recorded as genitives when
      `non_genitive(word._)` is false and the first entry of `word._`
      starts with 'Gen'; they are never used as targets.
    * `Word` instances with no items, with surface 'et' / 'neque', or whose
      first item is a preposition are "blocks" that stop the search.
    * `Word` instances whose first item is the adjective with base
      'plēnus', or a noun, are targets; a noun is additionally recorded as
      a genitive under the same test as above.
    * Anything else is ignored.

    Genitives are then processed from the highest index to the lowest.  For
    each one the nearest target is looked for first to the left, then to
    the right; a search direction is abandoned as soon as a block is met.
    If a target is found, `add_genitive` is called on it with the genitive
    word; otherwise `restrict_cases` is called on the genitive word with
    the non-genitive cases.  A trace line is printed for every genitive.

    Args:
        words: indexable sequence of word objects; elements are mutated in
            place through `add_genitive` / `restrict_cases`.

    Returns:
        A tuple `(words, attached)` where `attached` is the list of
        genitive indices for which a target was found, in descending
        order.  `(words, [])` when no genitive was detected.
    """
    M = len(words)
    targets = {}
    gen = []
    blocks = {}

    for i, word in enumerate(words):
        if isinstance(word, Predicate):
            pass
        elif isinstance(word, AndOr):
            if not non_genitive(word._) and word._[0][0] == 'Gen':
                gen.append(i)
        elif isinstance(word, Word):
            if not word.items:
                blocks[i] = word
                continue

            first_item = word.items[0]
            if word.surface in (u'et', u'neque') or first_item.pos == 'preposition':
                blocks[i] = word
                continue
            if first_item.pos == 'adj' and first_item.attrib('base') == u'plēnus':
                targets[i] = word
                continue

            if first_item.pos != 'noun':
                continue

            # NOTE(review): a genitive noun is itself registered as a target,
            # so one genitive may end up attached to another.
            targets[i] = word
            if not non_genitive(first_item._) and first_item._[0][0] == 'Gen':
                gen.append(i)

    if not gen:
        return (words, [])

    def find_target(gen_ix):
        """Return the index of the target for `gen_ix`, or -1 if none."""
        def sub(fr, to, step):
            # Walk from `fr` towards `to` (exclusive); a block aborts the
            # walk, the first target ends it successfully.
            for i in range(fr, to, step):
                if i in blocks:
                    return -1
                if i in targets:
                    return i
            return -1

        # Prefer a target preceding the genitive; fall back to one after it.
        pre = sub(gen_ix - 1, -1, -1)
        if pre >= 0:
            return pre
        return sub(gen_ix + 1, M, 1)

    gen.sort(reverse=True)

    non_gen = set()

    for gen_ix in gen:
        # The trace line is assembled first and emitted with one
        # single-argument print call, which produces the same text under both
        # the Python 2 print statement and the Python 3 print function.
        prefix = "// GEN#%d (%s)" % (gen_ix, words[gen_ix].surface_utf8())
        target_ix = find_target(gen_ix)
        if target_ix >= 0:
            print("%s -> TARGET#%d (%s)"
                  % (prefix, target_ix, words[target_ix].surface_utf8()))
            words[target_ix].add_genitive(words[gen_ix])
        else:
            print("%s -> no target noun detected" % prefix)
            non_gen.add(gen_ix)
            words[gen_ix].restrict_cases(('Nom', 'Voc', 'Acc', 'Dat', 'Abl', 'Loc'))

    return (words, [ix for ix in gen if ix not in non_gen])