Beispiel #1
0
def expression_valid_windows(windows, candidate):
    """Select the windows from which an expression may be built.

    A window is only considered when ``ana_useful.exists_linkword`` is truthy
    for it and ``ana_useful.count_cand`` reports exactly two candidates.

    * If the last item of the window is a candidate, the window is buffered
      unless the identifier of that candidate (item ``[2]`` of what
      ``ana_useful.which_cand`` returns) is one of the whitespace-separated
      tokens of ``candidate``.
    * Otherwise the window is shortened with ``ana_useful.cut_window(window,
      2)`` and kept if the shortened window still contains a linkword.

    Args:
        windows: iterable of windows (sequences of occurrences).
        candidate: string naming the current candidate; it is split on
            whitespace to obtain its component tokens.

    Returns:
        list: the directly accepted short windows, followed by whatever
        ``not_expa_inside_expre`` returns for the buffered windows.
    """
    valid_windows = []
    buff = []

    for window in windows:
        if not (ana_useful.exists_linkword(window)
                and ana_useful.count_cand(window) == 2):
            continue

        if ana_useful.is_cand(window[-1]):  # window[-1] is the last item
            cand2 = ana_useful.which_cand([window[-1]])
            # Avoid building an expression such as
            # "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate.split():
                # Valid window of the form (CAND1 + "aword" + CAND2) with a
                # linkword somewhere. It is buffered because we still need to
                # know whether the central aword repeats 3 times or more.
                buff.append(window)
        else:
            # Two CANDs but the last word is not one: per the original
            # author's note the window was CAND + CAND + any word, so the
            # trailing word is cut off.
            short_window = ana_useful.cut_window(window, 2)
            if ana_useful.exists_linkword(short_window):
                # Valid window of the form (CAND1 + CAND2) with a linkword
                # between them.
                valid_windows.append(short_window)

    # Per the original author's note: when the same central aword shows up 3
    # times or more it is better to build an expansion first, then an
    # expression; that filtering is delegated to not_expa_inside_expre.
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
Beispiel #2
0
def nucleus_valid_window(window):
    """Return the part of ``window`` starting at its first candidate.

    The window is first required to contain a linkword. It is then cut so
    that it starts at the first occurrence for which ``ana_useful.is_cand``
    is truthy (or left whole when there is none). The cut window is returned
    only if it holds fewer than two candidates and still contains a linkword.

    Args:
        window: sequence of occurrences.

    Returns:
        The right-hand slice of ``window``, or ``None`` when the window does
        not qualify.
    """
    if not ana_useful.exists_linkword(window):
        return None

    # Initialised before the loop so that an empty window (or a window
    # without any candidate) falls back to position 0 instead of leaving the
    # name unbound.
    index_cand = 0
    for position, occurrence in enumerate(window):
        if ana_useful.is_cand(occurrence):
            index_cand = position
            break

    right_window = window[index_cand:]
    if (ana_useful.count_cand(right_window) < 2
            and ana_useful.exists_linkword(right_window)):
        return right_window
    return None
Beispiel #3
0
def nucleus_valid_window(window):
    """Cut ``window`` at its first candidate and validate the remainder.

    Nothing is returned (``None``) unless ``ana_useful.exists_linkword`` is
    truthy for the whole window. The window is then sliced from the position
    of the first occurrence accepted by ``ana_useful.is_cand``; when no
    occurrence is accepted the slice starts at 0, i.e. the window is kept
    whole.

    Args:
        window: sequence of occurrences.

    Returns:
        The slice described above when it contains fewer than two candidates
        and still contains a linkword, otherwise ``None``.
    """
    if not ana_useful.exists_linkword(window):
        return None

    # next() with a default of 0 also covers the empty window, for which a
    # loop-assigned index would never be bound.
    index_cand = next(
        (position for position, occurrence in enumerate(window)
         if ana_useful.is_cand(occurrence)),
        0,
    )

    right_window = window[index_cand:]
    has_single_cand = ana_useful.count_cand(right_window) < 2
    if has_single_cand and ana_useful.exists_linkword(right_window):
        return right_window
    return None
Beispiel #4
0
def expression_find_cand(valid_windows, expression_threshold):
    """Group windows by their candidate "shortshape", keeping frequent ones.

    The shortshape of a window is the concatenation of item ``[2]`` of every
    occurrence of the window accepted by ``ana_useful.is_cand`` (e.g.
    'CANDCAND'). According to the original author's note, all shortshapes are
    expected to start with the same candidate (the one passed to the
    expression search).

    Args:
        valid_windows: list of windows (sequences of occurrences).
        expression_threshold: minimum number of windows that must share a
            shortshape for that shortshape to be kept.

    Returns:
        dict: shortshape -> list of the windows having that shortshape, in
        their original order, restricted to shortshapes seen at least
        ``expression_threshold`` times.
    """
    # One shortshape per window, in the same order as valid_windows.
    shortshapes = [
        ''.join(occurrence[2] for occurrence in window
                if ana_useful.is_cand(occurrence))
        for window in valid_windows
    ]

    # Count every shortshape once instead of calling list.count() for each
    # window, which made the selection quadratic.
    occurrences = {}
    for shortshape in shortshapes:
        occurrences[shortshape] = occurrences.get(shortshape, 0) + 1

    dict_cand_windows = {}
    for shortshape, window in zip(shortshapes, valid_windows):
        if occurrences[shortshape] >= expression_threshold:
            dict_cand_windows.setdefault(shortshape, []).append(window)
    return dict_cand_windows
Beispiel #5
0
def expansion_valid_window(windows):
    """Extract the left/right half-windows usable to build an expansion.

    Each window is split around its candidate into a left part (up to and
    including the candidate) and a right part (from the candidate onwards).
    A part is kept when it contains no linkword and the word on that side of
    the candidate, in the window stripped by ``ana_useful.window_wo_fword``,
    carries the tag ``'t'`` at index 2.

    Windows in which no occurrence is accepted by ``ana_useful.is_cand`` are
    skipped: previously they either raised ``UnboundLocalError`` or silently
    reused the candidate position of the preceding window.

    Args:
        windows: iterable of windows (sequences of occurrences).

    Returns:
        list: the accepted right and left half-windows, in window order
        (right half before left half for a given window).
    """
    valid_windows = []
    for window in windows:
        # Reset for every window so a stale position is never carried over.
        pos_cand = None
        for occurrence in window:
            if ana_useful.is_cand(occurrence):
                # No break: as before, the last matching occurrence wins.
                pos_cand = window.index(occurrence)
        if pos_cand is None:
            continue

        left_window = window[:pos_cand + 1]
        right_window = window[pos_cand:]

        exists_linkword_R = ana_useful.exists_linkword(right_window)
        exists_linkword_L = ana_useful.exists_linkword(left_window)

        clean_window = ana_useful.window_wo_fword(window)
        # Expansions must not contain any linkword.
        # Per the original author's note, the CAND is necessarily in position
        # 2 by construction once the "v" words are removed, so indexes 0 and
        # 2 of the clean window are its neighbours.
        # NOTE(review): the meaning of the 't' tag is not visible here.
        if clean_window[2][2] == 't' and not exists_linkword_R:
            valid_windows.append(right_window)
        if clean_window[0][2] == 't' and not exists_linkword_L:
            valid_windows.append(left_window)
    return valid_windows
Beispiel #6
0
def expansion_valid_window(windows):
    """Collect the half-windows around a candidate that can form expansions.

    For every window the candidate position is looked up, the window is
    sliced into a left half (ending on the candidate) and a right half
    (starting on it), and a half is retained when it has no linkword and the
    neighbouring word of the ``ana_useful.window_wo_fword`` version of the
    window is tagged ``'t'`` at index 2.

    A window without any occurrence accepted by ``ana_useful.is_cand`` is
    ignored. The candidate position used to leak from the previous window in
    that case (or to raise ``UnboundLocalError`` on the first window).

    Args:
        windows: iterable of windows (sequences of occurrences).

    Returns:
        list: retained half-windows; for one window the right half, if any,
        comes before the left half.
    """
    valid_windows = []
    for window in windows:
        cand_positions = [
            window.index(occurrence)
            for occurrence in window
            if ana_useful.is_cand(occurrence)
        ]
        if not cand_positions:
            # Nothing to expand around in this window.
            continue
        # Same choice as the historical loop without break: last match wins.
        pos_cand = cand_positions[-1]

        left_window = window[:pos_cand + 1]
        right_window = window[pos_cand:]

        exists_linkword_R = ana_useful.exists_linkword(right_window)
        exists_linkword_L = ana_useful.exists_linkword(left_window)

        clean_window = ana_useful.window_wo_fword(window)
        # Expansions must not contain any linkword.
        # Per the original author's note, the CAND is necessarily in position
        # 2 by construction once the "v" words are removed.
        # NOTE(review): confirm what the 't' tag stands for.
        right_is_t = clean_window[2][2] == 't'
        if right_is_t and not exists_linkword_R:
            valid_windows.append(right_window)
        left_is_t = clean_window[0][2] == 't'
        if left_is_t and not exists_linkword_L:
            valid_windows.append(left_window)
    return valid_windows
Beispiel #7
0
def expression_valid_windows(windows, candidate):
    """Return the windows that qualify for building an expression.

    Only windows for which ``ana_useful.exists_linkword`` is truthy and
    ``ana_useful.count_cand`` equals 2 are examined:

    * when the last item is a candidate, the window goes to a buffer, except
      if item ``[2]`` of ``ana_useful.which_cand([last_item])`` is one of the
      whitespace-separated tokens of ``candidate``;
    * otherwise the window is reduced with ``ana_useful.cut_window(window,
      2)`` and accepted when the reduced window still has a linkword.

    Args:
        windows: iterable of windows (sequences of occurrences).
        candidate: string naming the current candidate, split on whitespace.

    Returns:
        list: accepted reduced windows, then the result of
        ``not_expa_inside_expre`` applied to the buffered windows.
    """
    valid_windows = []
    buff = []

    for window in windows:
        has_two_linked_cands = (
            ana_useful.exists_linkword(window)
            and ana_useful.count_cand(window) == 2
        )
        if not has_two_linked_cands:
            continue

        last_item = window[-1]
        if ana_useful.is_cand(last_item):
            cand2 = ana_useful.which_cand([last_item])
            # Skip windows that would yield an expression like
            # "bâtiment de cet ensemble de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate.split():
                # Shape (CAND1 + "aword" + CAND2) with a linkword somewhere;
                # buffered until we know whether the central aword repeats.
                buff.append(window)
            continue

        # Two CANDs and a non-CAND last word: per the original author's note
        # the window was CAND + CAND + any word, hence the cut to 2.
        short_window = ana_useful.cut_window(window, 2)
        if ana_useful.exists_linkword(short_window):
            # Shape (CAND1 + CAND2) with a linkword between them.
            valid_windows.append(short_window)

    # Per the original author's note, a central aword seen 3 times or more
    # should rather lead to an expansion first; not_expa_inside_expre is
    # responsible for that check.
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
Beispiel #8
0
def expression_find_cand(valid_windows, expression_threshold):
    """Map each frequent candidate shortshape to the windows producing it.

    A shortshape is built per window by concatenating item ``[2]`` of the
    occurrences accepted by ``ana_useful.is_cand`` (e.g. 'CANDCAND'). The
    original author notes that all shortshapes should start with the same
    candidate, namely the one the expression search was started from.

    Args:
        valid_windows: list of windows (sequences of occurrences).
        expression_threshold: a shortshape is kept only when at least this
            many windows share it.

    Returns:
        dict: shortshape -> windows with that shortshape, in input order.
    """
    from collections import Counter

    # The order of shortshapes mirrors the order of valid_windows.
    shortshapes = []
    for window in valid_windows:
        cand_ids = [occurrence[2] for occurrence in window
                    if ana_useful.is_cand(occurrence)]
        shortshapes.append(''.join(cand_ids))

    # Single counting pass; list.count() per window was quadratic.
    frequency = Counter(shortshapes)

    dict_cand_windows = {}
    for shortshape, window in zip(shortshapes, valid_windows):
        if frequency[shortshape] >= expression_threshold:
            dict_cand_windows.setdefault(shortshape, []).append(window)
    return dict_cand_windows