def detect(and_or_word):
    """Collapse repeated occurrences of ``and_or_word`` into ``AndOr`` nodes.

    For every group in ``groups`` in which ``and_or_word`` occurs at least
    twice, an ``AndOr`` object is built from the runs of words that follow
    each occurrence.  The object is stored in ``words`` at the index of the
    first occurrence and that index is recorded in ``ao_loc``.  Every index
    from the first occurrence up to the end of the last accepted run is added
    to ``visited``.

    ``groups``, ``words``, ``visited`` and ``ao_loc`` are not parameters: they
    come from the surrounding scope, and ``words``, ``visited`` and ``ao_loc``
    are modified in place.

    Args:
        and_or_word: the conjunction to look for; it is passed through to
            ``word_indices_in_group`` and to the ``AndOr`` constructor.

    Returns:
        None.
    """
    # Computed for all groups up front, before ``words`` is modified below.
    # NOTE(review): presumably each entry lists the positions in ``words``
    # where the conjunction occurs inside that group -- confirm against
    # word_indices_in_group.
    ao_indices = [word_indices_in_group(and_or_word, group) for group in groups]

    for i, aos in enumerate(ao_indices):
        num_of_ao = len(aos)
        if num_of_ao < 2:
            continue

        cl = AndOr(and_or_word)

        # First add the members whose extent is already known: everything
        # between one occurrence and the next (i.e. all but the last member).
        for j in range(num_of_ao - 1):
            cl.add(words[aos[j] + 1:aos[j + 1]])

        # The last member has no closing conjunction, so scan forward and
        # decide word by word where it ends.
        last_ao = aos[-1]
        # Exclusive upper bound of the scan.
        # NOTE(review): assumes the group's last element is its highest word
        # index -- confirm.
        group_end = groups[i][-1] + 1
        tail = []
        # Exclusive end of the accepted span.  Initialised before the loop so
        # that an empty scan range (conjunction is the last word of the group)
        # yields a well-defined value instead of an unbound or stale index.
        stop = last_ao + 1
        for pos in range(last_ao + 1, group_end):
            word = words[pos]
            first_item = word.items[0]
            if cl.pos == 'noun':
                # NOTE(review): ``_`` appears to hold the inflection tags,
                # with the case name first in each tag -- confirm.
                tags = first_item._
                # Only words that inflect for case may continue a noun list.
                if tags is None:
                    break
                if non_genitive(tags):
                    cases = [tag[0] for tag in tags]
                    # Stop when the word cannot agree in case with the
                    # members collected so far.
                    if not any(case in cases for case in cl.cases):
                        break
            tail.append(word)
            stop = pos + 1

        if tail:
            cl.add(tail)
        # When nothing could be accepted the last member stays empty; the
        # original author flagged this case as "ERROR: type mismatch" and
        # left it unhandled.

        for j in range(aos[0], stop):
            visited.add(j)

        cl.restrict()
        words[aos[0]] = cl
        ao_loc.add(aos[0])
def detect_genitive_correspondances(words):
    """Attach genitive words to a neighbouring word they can modify.

    Each element of ``words`` is classified by position:

    * ``Predicate`` instances and objects of any other unknown type are
      ignored.
    * ``AndOr`` instances are genitive candidates when their tags are
      genitive-only; they are never targets.
    * ``Word`` instances with no items, with surface ``et`` / ``neque``, or
      whose first item is a preposition, are *blocks* that stop the search.
    * ``Word`` instances whose first item is the adjective ``plēnus`` or any
      noun are *targets*; genitive-only nouns are candidates as well.

    Candidates are processed from right to left.  For each one the nearest
    target to its left is looked up first, then the nearest to its right; a
    block reached before a target ends the search in that direction.  A
    candidate with a target is handed to the target's ``add_genitive``;
    otherwise the candidate's cases are restricted to the non-genitive ones.
    One trace line per candidate is printed to standard output.

    Args:
        words: indexable sequence of parsed word objects; the objects are
            modified in place.

    Returns:
        A tuple ``(words, attached)`` where ``attached`` is the list of
        candidate indices (in descending order) for which a target was found.
    """
    M = len(words)
    targets = set()
    blocks = set()
    gen = []

    def is_genitive(tags):
        # NOTE(review): read as "genitive is the only possible case" --
        # confirm against non_genitive.
        return not non_genitive(tags) and tags[0][0] == 'Gen'

    for i, word in enumerate(words):
        # The isinstance order matters if these classes are related, so it
        # is kept as Predicate, AndOr, Word.
        if isinstance(word, Predicate):
            continue
        if isinstance(word, AndOr):
            if is_genitive(word._):
                gen.append(i)
            continue
        if not isinstance(word, Word):
            continue

        if not word.items:
            blocks.add(i)
            continue
        first_item = word.items[0]
        if word.surface in (u'et', u'neque') or first_item.pos == 'preposition':
            blocks.add(i)
            continue
        if first_item.pos == 'adj' and first_item.attrib('base') == u'plēnus':
            targets.add(i)
            continue
        if first_item.pos != 'noun':
            continue
        # Every noun is a possible target, genitive nouns included.
        targets.add(i)
        if is_genitive(first_item._):
            gen.append(i)

    if not gen:
        return (words, [])

    def nearest_target(start, stop, step):
        """Return the first target index in the range, or -1 if blocked."""
        for ix in range(start, stop, step):
            if ix in blocks:
                return -1
            if ix in targets:
                return ix
        return -1

    def find_target(gen_ix):
        """Prefer a target to the left of ``gen_ix``, then one to the right."""
        before = nearest_target(gen_ix - 1, -1, -1)
        if before >= 0:
            return before
        return nearest_target(gen_ix + 1, M, 1)

    gen.sort(reverse=True)
    non_gen = set()
    for gen_ix in gen:
        # The trace line is built in two parts and emitted with a single
        # print call, which is valid on both Python 2 and Python 3.
        head = "// GEN#%d (%s)" % (gen_ix, words[gen_ix].surface_utf8())
        target_ix = find_target(gen_ix)
        if target_ix >= 0:
            print("%s -> TARGET#%d (%s)" % (head, target_ix, words[target_ix].surface_utf8()))
            words[target_ix].add_genitive(words[gen_ix])
        else:
            print("%s -> no target noun detected" % head)
            non_gen.add(gen_ix)
            words[gen_ix].restrict_cases(('Nom', 'Voc', 'Acc', 'Dat', 'Abl', 'Loc'))

    return (words, [ix for ix in gen if ix not in non_gen])