Esempio n. 1
0
def get_window_distance_with_selected_words(w1, w2, distance_measure,
                                            selected_words):
    """Compute a distance between two windows over a fixed set of selected words.

    Each window is passed through
    ``preprocess_NLP_pkg.word_freq_count_normalised`` and the result is
    restricted to ``selected_words`` with
    ``preprocess_NLP_pkg.select_feature_vector``; the values of that selection
    form the window's feature vector. The two vectors are then handed to
    ``distance_measure``.

        Keyword arguments:
           w1 -- first window; its vector is the first argument of distance_measure
           w2 -- second window; its vector is the second argument of distance_measure
           distance_measure -- callable taking the two feature vectors (lists)
           selected_words -- the features both windows are projected onto

        Returns:
           Whatever distance_measure returns, or None if calling it raises
           NameError (a message is printed in that case).
    """

    def _selected_vector(window):
        # Same two-step projection for either window: frequencies first,
        # then keep only the selected features' values.
        frequencies = preprocess_NLP_pkg.word_freq_count_normalised(window)
        selection = preprocess_NLP_pkg.select_feature_vector(
            frequencies, selected_words)
        return list(selection.values())

    vector_w1 = _selected_vector(w1)
    vector_w2 = _selected_vector(w2)

    try:
        return distance_measure(vector_w1, vector_w2)
    except NameError:
        # Only NameError is handled; any other exception from
        # distance_measure propagates to the caller.
        print(
            "The function ", distance_measure,
            " does not exist! Returning None. Distance Matrix will contain None values."
        )
        return None
def calculate_words_wfm(windows, selected_features):
    """Build a window-feature matrix from word frequencies.

    For every window, ``preprocess_NLP_pkg.word_freq_count_normalised`` is
    applied and the result is restricted to ``selected_features`` via
    ``preprocess_NLP_pkg.select_feature_vector``; the values of that selection
    become one row of the matrix.

        Keyword arguments:
            windows -- iterable of windows (already split; no splitting is done here)
            selected_features -- the features for which each row's vector is generated

        Returns:
            The rows, in the order of ``windows``, converted with ``np.asarray``.
    """
    rows = [
        list(
            preprocess_NLP_pkg.select_feature_vector(
                preprocess_NLP_pkg.word_freq_count_normalised(win),
                selected_features,
            ).values()
        )
        for win in windows
    ]
    return np.asarray(rows)
def calculate_ngrams_wfm(windows, selected_features, n):
    """Build a window-feature matrix from character n-gram frequencies.

    For every window, ``preprocess_NLP_pkg.char_ngram_count_normalised`` is
    called with ``n=n`` and the result is restricted to ``selected_features``
    via ``preprocess_NLP_pkg.select_feature_vector``; the values of that
    selection become one row of the matrix.

        Keyword arguments:
            windows -- iterable of windows (already split; no splitting is done here)
            selected_features -- the features for which each row's vector is generated
            n -- n in ngrams, forwarded to char_ngram_count_normalised

        Returns:
            The rows, in the order of ``windows``, converted with ``np.asarray``.
    """

    def _row_for(win):
        # One matrix row: n-gram frequencies narrowed to the selected features.
        frequencies = preprocess_NLP_pkg.char_ngram_count_normalised(win, n=n)
        chosen = preprocess_NLP_pkg.select_feature_vector(
            frequencies, selected_features)
        return list(chosen.values())

    return np.asarray([_row_for(win) for win in windows])