# NOTE(review): a second definition of `nucleus_find_cand` follows this one in
# the file and rebinds the name; this copy is kept in sync with it.
def nucleus_find_cand(dict_aword, nucleus_threshold):
    """Collect the 't'-tagged occurrences of every shortshape that passes a threshold.

    For each shortshape, every unordered pair of non-equal windows is
    classified by comparing index 1 of their ``ana_useful.which_linkword``
    results and index 2 of their ``ana_useful.which_cand`` results:

    * category 0: same linkword and same CAND
    * category 1: same linkword and different CAND
    * category 2: different linkword and same CAND
    * category 3: different linkword and different CAND

    A shortshape is retained as soon as the number of pairs in any category
    reaches the threshold of that category.

    Args:
        dict_aword: mapping ``shortshape -> windows``, where each window is an
            iterable of occurrences (index 2 of an occurrence is compared
            against ``'t'``).
        nucleus_threshold: indexable holding the four category thresholds, in
            the category order listed above.

    Returns:
        dict mapping each retained shortshape to the list of occurrences whose
        index 2 equals ``'t'``, in window order. A retained shortshape that has
        no such occurrence gets no entry.
    """
    from itertools import combinations

    dict_occ_cand = {}
    for shortshape, windows in dict_aword.items():
        # Resolve the linkword and the CAND once per window instead of once
        # per compared pair.
        described = [
            (window,
             ana_useful.which_linkword(window),
             ana_useful.which_cand(window))
            for window in windows
        ]

        # Counting each unordered pair once replaces the former "count every
        # ordered pair, then halve" computation; the comparison against the
        # thresholds is unchanged.
        pair_counts = [0, 0, 0, 0]
        for first, second in combinations(described, 2):
            window, linkword, cand = first
            window1, linkword1, cand1 = second
            if window1 != window:
                same_linkword = linkword[1] == linkword1[1]
                same_cand = cand[2] == cand1[2]
                category = (0 if same_linkword else 2) + (0 if same_cand else 1)
                pair_counts[category] += 1

        # Thresholds are indexed lazily and in order, like the original
        # short-circuiting `or` chain.
        if any(pair_counts[i] >= nucleus_threshold[i] for i in range(4)):
            tagged = [
                occurrence
                for window, _, _ in described
                for occurrence in window
                if occurrence[2] == 't'
            ]
            if tagged:
                dict_occ_cand[shortshape] = tagged
    return dict_occ_cand
# NOTE(review): an earlier definition of `nucleus_find_cand` precedes this one
# in the file; this later definition is the one that stays bound to the name.
def nucleus_find_cand(dict_aword, nucleus_threshold):
    """Collect the 't'-tagged occurrences of every shortshape that passes a threshold.

    For each shortshape, every unordered pair of non-equal windows is
    classified by comparing index 1 of their ``ana_useful.which_linkword``
    results and index 2 of their ``ana_useful.which_cand`` results:

    * category 0: same linkword and same CAND
    * category 1: same linkword and different CAND
    * category 2: different linkword and same CAND
    * category 3: different linkword and different CAND

    A shortshape is retained as soon as the number of pairs in any category
    reaches the threshold of that category.

    Args:
        dict_aword: mapping ``shortshape -> windows``, where each window is an
            iterable of occurrences (index 2 of an occurrence is compared
            against ``'t'``).
        nucleus_threshold: indexable holding the four category thresholds, in
            the category order listed above.

    Returns:
        dict mapping each retained shortshape to the list of occurrences whose
        index 2 equals ``'t'``, in window order. A retained shortshape that has
        no such occurrence gets no entry.
    """
    from itertools import combinations

    dict_occ_cand = {}
    for shortshape, windows in dict_aword.items():
        # Resolve the linkword and the CAND once per window instead of once
        # per compared pair.
        described = [
            (window,
             ana_useful.which_linkword(window),
             ana_useful.which_cand(window))
            for window in windows
        ]

        # Counting each unordered pair once replaces the former "count every
        # ordered pair, then halve" computation; the comparison against the
        # thresholds is unchanged.
        pair_counts = [0, 0, 0, 0]
        for first, second in combinations(described, 2):
            window, linkword, cand = first
            window1, linkword1, cand1 = second
            if window1 != window:
                same_linkword = linkword[1] == linkword1[1]
                same_cand = cand[2] == cand1[2]
                category = (0 if same_linkword else 2) + (0 if same_cand else 1)
                pair_counts[category] += 1

        # Thresholds are indexed lazily and in order, like the original
        # short-circuiting `or` chain.
        if any(pair_counts[i] >= nucleus_threshold[i] for i in range(4)):
            tagged = [
                occurrence
                for window, _, _ in described
                for occurrence in window
                if occurrence[2] == 't'
            ]
            if tagged:
                dict_occ_cand[shortshape] = tagged
    return dict_occ_cand
# NOTE(review): a second definition of `expression_valid_windows` follows this
# one in the file and rebinds the name; this copy is kept in sync with it.
def expression_valid_windows(windows, candidate):
    """Select the windows from which an expression can be built.

    Only windows that contain a linkword and exactly two CANDs (according to
    ``ana_useful.exists_linkword`` / ``ana_useful.count_cand``) are considered.

    * If the last item of the window is a CAND, the window is buffered unless
      index 2 of that CAND is one of the whitespace-separated words of
      ``candidate``. This avoids building expressions such as
      "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
      Buffered windows have the shape CAND1 + aword + CAND2 with a linkword
      somewhere in them.
    * Otherwise the window is cut with ``ana_useful.cut_window(window, 2)``
      and the shortened window is kept if it still contains a linkword
      (shape CAND1 + CAND2 with a linkword between them).

    The buffered windows are finally passed through ``not_expa_inside_expre``
    and whatever it returns is appended after the shortened windows.

    Args:
        windows: iterable of windows; each window must support ``window[-1]``.
        candidate: string whose whitespace-separated words are excluded as a
            trailing CAND.

    Returns:
        list of valid windows: the shortened windows first, followed by the
        result of ``not_expa_inside_expre`` on the buffered windows.
    """
    # The words of the candidate do not change from one window to the next.
    candidate_words = candidate.split()

    valid_windows = []
    buff = []
    for window in windows:
        # NOTE(review): the `== True` comparisons are kept as-is because the
        # return type of `exists_linkword` is not visible from here; switch to
        # plain truthiness once it is confirmed to return a bool.
        if not (ana_useful.exists_linkword(window) == True
                and ana_useful.count_cand(window) == 2):
            continue

        if ana_useful.is_cand(window[-1]):
            cand2 = ana_useful.which_cand([window[-1]])
            if cand2[2] not in candidate_words:
                # Buffered rather than accepted right away: we first need to
                # know whether the aword in the centre is the same 3 times or
                # more (in that case it would be better to build an expansion
                # first, then an expression).
                buff.append(window)
            continue

        # There are 2 CANDs, the window is 3 words long and its last word is
        # not a CAND, so the window was of the form CAND + CAND + any word.
        short_window = ana_useful.cut_window(window, 2)
        if ana_useful.exists_linkword(short_window) == True:
            valid_windows.append(short_window)

    # Presumably drops the buffered windows whose central aword repeats too
    # often (see the comment above) -- confirm against not_expa_inside_expre.
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
# NOTE(review): an earlier definition of `expression_valid_windows` precedes
# this one in the file; this later definition is the one that stays bound.
def expression_valid_windows(windows, candidate):
    """Select the windows from which an expression can be built.

    Only windows that contain a linkword and exactly two CANDs (according to
    ``ana_useful.exists_linkword`` / ``ana_useful.count_cand``) are considered.

    * If the last item of the window is a CAND, the window is buffered unless
      index 2 of that CAND is one of the whitespace-separated words of
      ``candidate``. This avoids building expressions such as
      "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
      Buffered windows have the shape CAND1 + aword + CAND2 with a linkword
      somewhere in them.
    * Otherwise the window is cut with ``ana_useful.cut_window(window, 2)``
      and the shortened window is kept if it still contains a linkword
      (shape CAND1 + CAND2 with a linkword between them).

    The buffered windows are finally passed through ``not_expa_inside_expre``
    and whatever it returns is appended after the shortened windows.

    Args:
        windows: iterable of windows; each window must support ``window[-1]``.
        candidate: string whose whitespace-separated words are excluded as a
            trailing CAND.

    Returns:
        list of valid windows: the shortened windows first, followed by the
        result of ``not_expa_inside_expre`` on the buffered windows.
    """
    # The words of the candidate do not change from one window to the next.
    candidate_words = candidate.split()

    valid_windows = []
    buff = []
    for window in windows:
        # NOTE(review): the `== True` comparisons are kept as-is because the
        # return type of `exists_linkword` is not visible from here; switch to
        # plain truthiness once it is confirmed to return a bool.
        if not (ana_useful.exists_linkword(window) == True
                and ana_useful.count_cand(window) == 2):
            continue

        if ana_useful.is_cand(window[-1]):
            cand2 = ana_useful.which_cand([window[-1]])
            if cand2[2] not in candidate_words:
                # Buffered rather than accepted right away: we first need to
                # know whether the aword in the centre is the same 3 times or
                # more (in that case it would be better to build an expansion
                # first, then an expression).
                buff.append(window)
            continue

        # There are 2 CANDs, the window is 3 words long and its last word is
        # not a CAND, so the window was of the form CAND + CAND + any word.
        short_window = ana_useful.cut_window(window, 2)
        if ana_useful.exists_linkword(short_window) == True:
            valid_windows.append(short_window)

    # Presumably drops the buffered windows whose central aword repeats too
    # often (see the comment above) -- confirm against not_expa_inside_expre.
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows