def expression_valid_windows(windows, candidate):
    """Select the windows that can be used to build an expression.

    A window is examined only when ``ana_useful.exists_linkword`` reports a
    linkword in it and ``ana_useful.count_cand`` counts exactly two CANDs.

    * When the last item of the window is a CAND, the window has the shape
      (CAND1 + "aword" + CAND2). It is put in a buffer unless item ``[2]`` of
      ``ana_useful.which_cand([last item])`` is one of the whitespace
      separated tokens of ``candidate``. The buffer is later handed to
      ``not_expa_inside_expre`` (defined elsewhere in this module).
    * Otherwise the window is shortened with
      ``ana_useful.cut_window(window, 2)`` and kept when the shortened
      window still contains a linkword.

    Args:
        windows: iterable of windows (indexable sequences of occurrences).
        candidate: string whose whitespace separated tokens are compared
            with item ``[2]`` of the second CAND (presumably CAND
            identifiers -- confirm against the caller).

    Returns:
        A list holding the shortened windows first, followed by whatever
        ``not_expa_inside_expre`` returns for the buffered windows.
    """
    valid_windows = []
    buff = []
    # Loop invariant: the tokens of the candidate do not change per window.
    candidate_parts = candidate.split()
    for window in windows:
        if not (ana_useful.exists_linkword(window)
                and ana_useful.count_cand(window) == 2):
            continue
        if ana_useful.is_cand(window[-1]):  # window[-1] is the last item
            cand2 = ana_useful.which_cand([window[-1]])
            # Avoids building expressions such as
            # "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate_parts:
                # Valid window of shape (CAND1 + "aword" + CAND2) with a
                # schema word (linkword) somewhere. Buffered because we need
                # to know whether the middle aword is the same 3 times or
                # more (in that case it would be better to build an
                # expansion first, then an expression).
                buff.append(window)
        else:
            # There are 2 CANDs, the window is 3 words long and the last
            # word is not a CAND, so the window had the shape
            # CAND + CAND + any word.
            short_window = ana_useful.cut_window(window, 2)
            if ana_useful.exists_linkword(short_window):
                # Valid window of shape (CAND1 + CAND2) with a schema word
                # (linkword) between them.
                valid_windows.append(short_window)
    # Check that the middle aword is not the same 3 times or more (in that
    # case it would be better to build an expansion first, then an
    # expression).
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
def nucleus_valid_window(window):
    """Return the tail of ``window`` starting at its first CAND, if valid.

    The window must contain a linkword (``ana_useful.exists_linkword``).
    It is then sliced from the position of the first item for which
    ``ana_useful.is_cand`` is true (from position 0 when there is none).
    The slice is returned only when it holds fewer than two CANDs
    (``ana_useful.count_cand``) and still contains a linkword.

    Args:
        window: indexable sequence of occurrences.

    Returns:
        The sliced window, or ``None`` when any of the conditions fails.
    """
    if not ana_useful.exists_linkword(window):
        return None
    # Initialised before the loop so that it is always bound, even when the
    # window is empty or contains no CAND.
    index_cand = 0
    for position, occurrence in enumerate(window):
        if ana_useful.is_cand(occurrence):
            index_cand = position
            break
    right_window = window[index_cand:]
    if (ana_useful.count_cand(right_window) < 2
            and ana_useful.exists_linkword(right_window)):
        return right_window
    return None
def nucleus_valid_window(window):
    """Return the tail of ``window`` starting at its first CAND, if valid.

    The window must contain a linkword (``ana_useful.exists_linkword``).
    It is then sliced from the position of the first item for which
    ``ana_useful.is_cand`` is true (from position 0 when there is none).
    The slice is returned only when it holds fewer than two CANDs
    (``ana_useful.count_cand``) and still contains a linkword.

    Args:
        window: indexable sequence of occurrences.

    Returns:
        The sliced window, or ``None`` when any of the conditions fails.
    """
    if not ana_useful.exists_linkword(window):
        return None
    # Initialised before the loop so that it is always bound, even when the
    # window is empty or contains no CAND.
    index_cand = 0
    for position, occurrence in enumerate(window):
        if ana_useful.is_cand(occurrence):
            index_cand = position
            break
    right_window = window[index_cand:]
    if (ana_useful.count_cand(right_window) < 2
            and ana_useful.exists_linkword(right_window)):
        return right_window
    return None
def expression_find_cand(valid_windows, expression_threshold):
    """Group the valid windows by shortshape, keeping only frequent shapes.

    A shortshape is built for every window by concatenating item ``[2]`` of
    each occurrence for which ``ana_useful.is_cand`` is true, giving a
    'CANDCAND' string. A priori every shortshape starts with the same cand
    (the one given as argument to the function `recherche_expression`).

    Args:
        valid_windows: sequence of windows (sequences of occurrences).
        expression_threshold: minimum number of windows that must share a
            shortshape for that shortshape to be kept.

    Returns:
        A dict mapping each kept shortshape to the list of its windows, in
        the order in which they appear in ``valid_windows``.
    """
    from collections import Counter

    # The order of shortshape_list follows the order of valid_windows, so
    # the two can be walked in lockstep below.
    shortshape_list = [
        ''.join(occurrence[2] for occurrence in window
                if ana_useful.is_cand(occurrence))
        for window in valid_windows
    ]
    # Count every shape once instead of calling list.count() per window.
    shape_counts = Counter(shortshape_list)
    dict_cand_windows = {}
    for shortshape, window in zip(shortshape_list, valid_windows):
        if shape_counts[shortshape] >= expression_threshold:
            dict_cand_windows.setdefault(shortshape, []).append(window)
    return dict_cand_windows
def expansion_valid_window(windows):
    """Extract the left/right sub-windows usable to build an expansion.

    For every window the position of its CAND is located, then the window
    is split into a left part (up to and including the CAND) and a right
    part (from the CAND onwards). A part is kept when it contains no
    linkword and when the neighbouring item of the cleaned window
    (``ana_useful.window_wo_fword``) has ``'t'`` as item ``[2]``.

    Args:
        windows: iterable of windows (lists of occurrences). The cleaned
            window is indexed at 0 and 2, so it is assumed to hold at least
            three items with the CAND in the middle -- per the original
            comment this is guaranteed by construction; confirm against the
            caller.

    Returns:
        A list of the retained right and left sub-windows.
    """
    valid_windows = []
    for window in windows:
        # Reset for every window so that a position found in a previous
        # window can never be reused by mistake.
        pos_cand = None
        for occurrence in window:
            if ana_useful.is_cand(occurrence):
                # No break: as in the original code, the last CAND wins.
                pos_cand = window.index(occurrence)
        if pos_cand is None:
            # No CAND in this window: nothing to expand.
            continue
        left_window = window[:pos_cand + 1]
        right_window = window[pos_cand:]
        exists_linkword_R = ana_useful.exists_linkword(right_window)
        exists_linkword_L = ana_useful.exists_linkword(left_window)
        clean_window = ana_useful.window_wo_fword(window)
        # Expansions must not contain a schema word (linkword).
        # The CAND is necessarily in position 2 by construction and after
        # removal of the "v" words (presumably "mots vides", i.e. the
        # function words dropped by window_wo_fword -- confirm).
        if clean_window[2][2] == 't' and not exists_linkword_R:
            valid_windows.append(right_window)
        if clean_window[0][2] == 't' and not exists_linkword_L:
            valid_windows.append(left_window)
    return valid_windows
def expansion_valid_window(windows):
    """Extract the left/right sub-windows usable to build an expansion.

    For every window the position of its CAND is located, then the window
    is split into a left part (up to and including the CAND) and a right
    part (from the CAND onwards). A part is kept when it contains no
    linkword and when the neighbouring item of the cleaned window
    (``ana_useful.window_wo_fword``) has ``'t'`` as item ``[2]``.

    Args:
        windows: iterable of windows (lists of occurrences). The cleaned
            window is indexed at 0 and 2, so it is assumed to hold at least
            three items with the CAND in the middle -- per the original
            comment this is guaranteed by construction; confirm against the
            caller.

    Returns:
        A list of the retained right and left sub-windows.
    """
    valid_windows = []
    for window in windows:
        # Reset for every window so that a position found in a previous
        # window can never be reused by mistake.
        pos_cand = None
        for occurrence in window:
            if ana_useful.is_cand(occurrence):
                # No break: as in the original code, the last CAND wins.
                pos_cand = window.index(occurrence)
        if pos_cand is None:
            # No CAND in this window: nothing to expand.
            continue
        left_window = window[:pos_cand + 1]
        right_window = window[pos_cand:]
        exists_linkword_R = ana_useful.exists_linkword(right_window)
        exists_linkword_L = ana_useful.exists_linkword(left_window)
        clean_window = ana_useful.window_wo_fword(window)
        # Expansions must not contain a schema word (linkword).
        # The CAND is necessarily in position 2 by construction and after
        # removal of the "v" words (presumably "mots vides", i.e. the
        # function words dropped by window_wo_fword -- confirm).
        if clean_window[2][2] == 't' and not exists_linkword_R:
            valid_windows.append(right_window)
        if clean_window[0][2] == 't' and not exists_linkword_L:
            valid_windows.append(left_window)
    return valid_windows
def expression_valid_windows(windows, candidate):
    """Select the windows that can be used to build an expression.

    A window is examined only when ``ana_useful.exists_linkword`` reports a
    linkword in it and ``ana_useful.count_cand`` counts exactly two CANDs.

    * When the last item of the window is a CAND, the window has the shape
      (CAND1 + "aword" + CAND2). It is put in a buffer unless item ``[2]`` of
      ``ana_useful.which_cand([last item])`` is one of the whitespace
      separated tokens of ``candidate``. The buffer is later handed to
      ``not_expa_inside_expre`` (defined elsewhere in this module).
    * Otherwise the window is shortened with
      ``ana_useful.cut_window(window, 2)`` and kept when the shortened
      window still contains a linkword.

    Args:
        windows: iterable of windows (indexable sequences of occurrences).
        candidate: string whose whitespace separated tokens are compared
            with item ``[2]`` of the second CAND (presumably CAND
            identifiers -- confirm against the caller).

    Returns:
        A list holding the shortened windows first, followed by whatever
        ``not_expa_inside_expre`` returns for the buffered windows.
    """
    valid_windows = []
    buff = []
    # Loop invariant: the tokens of the candidate do not change per window.
    candidate_parts = candidate.split()
    for window in windows:
        if not (ana_useful.exists_linkword(window)
                and ana_useful.count_cand(window) == 2):
            continue
        if ana_useful.is_cand(window[-1]):  # window[-1] is the last item
            cand2 = ana_useful.which_cand([window[-1]])
            # Avoids building expressions such as
            # "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate_parts:
                # Valid window of shape (CAND1 + "aword" + CAND2) with a
                # schema word (linkword) somewhere. Buffered because we need
                # to know whether the middle aword is the same 3 times or
                # more (in that case it would be better to build an
                # expansion first, then an expression).
                buff.append(window)
        else:
            # There are 2 CANDs, the window is 3 words long and the last
            # word is not a CAND, so the window had the shape
            # CAND + CAND + any word.
            short_window = ana_useful.cut_window(window, 2)
            if ana_useful.exists_linkword(short_window):
                # Valid window of shape (CAND1 + CAND2) with a schema word
                # (linkword) between them.
                valid_windows.append(short_window)
    # Check that the middle aword is not the same 3 times or more (in that
    # case it would be better to build an expansion first, then an
    # expression).
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
def expression_find_cand(valid_windows, expression_threshold):
    """Group the valid windows by shortshape, keeping only frequent shapes.

    A shortshape is built for every window by concatenating item ``[2]`` of
    each occurrence for which ``ana_useful.is_cand`` is true, giving a
    'CANDCAND' string. A priori every shortshape starts with the same cand
    (the one given as argument to the function `recherche_expression`).

    Args:
        valid_windows: sequence of windows (sequences of occurrences).
        expression_threshold: minimum number of windows that must share a
            shortshape for that shortshape to be kept.

    Returns:
        A dict mapping each kept shortshape to the list of its windows, in
        the order in which they appear in ``valid_windows``.
    """
    from collections import Counter

    # The order of shortshape_list follows the order of valid_windows, so
    # the two can be walked in lockstep below.
    shortshape_list = [
        ''.join(occurrence[2] for occurrence in window
                if ana_useful.is_cand(occurrence))
        for window in valid_windows
    ]
    # Count every shape once instead of calling list.count() per window.
    shape_counts = Counter(shortshape_list)
    dict_cand_windows = {}
    for shortshape, window in zip(shortshape_list, valid_windows):
        if shape_counts[shortshape] >= expression_threshold:
            dict_cand_windows.setdefault(shortshape, []).append(window)
    return dict_cand_windows