Example #1
import itertools
from typing import List

import utility


def query_expansion(query: List[str], n_top_word: int = 10) -> List[str]:
    """
    Expands a given query based on the words it contains.
    Specifically, for each word we count the words appearing directly before
    and after it in the documents and add the n_top_word most frequent
    neighbors to the query.
    :param query: list of query words
    :param n_top_word: number of frequent neighboring words to add per query word.
    :return: expanded query
    """
    documents = utility.load_vector_file("Generated Files/doc2word.csv")
    result = []
    for word in query:
        expanded_query = {}
        # find all documents that contain the word
        document_ids = [
            ids for ids, values in documents.items() if word in values
        ]
        for new_id_doc in document_ids:
            # add window size neighboring words
            document = documents[new_id_doc]
            word_index = document.index(word)
            if word_index == 0:
                # first word in the document: only the following neighbor exists
                if len(document) > 1:
                    after_word = document[word_index + 1]
                    expanded_query[after_word] = expanded_query.get(
                        after_word, 0) + 1
            elif word_index == len(document) - 1:
                # last word in the document: only the preceding neighbor exists
                before_word = document[word_index - 1]
                expanded_query[before_word] = expanded_query.get(
                    before_word, 0) + 1
            else:
                before_word = document[word_index - 1]
                after_word = document[word_index + 1]
                expanded_query[before_word] = expanded_query.get(
                    before_word, 0) + 1
                expanded_query[after_word] = expanded_query.get(after_word,
                                                                0) + 1
        sorted_query_words = list(
            dict(
                sorted(expanded_query.items(),
                       key=lambda x: x[1],
                       reverse=True)).keys())
        result.append(sorted_query_words[:n_top_word])
    result.append(query)
    return list(set(itertools.chain.from_iterable(result)))
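
A short usage sketch of the function above; the query words are invented for illustration, and "Generated Files/doc2word.csv" must already have been generated:

example_query = ["klima", "energi"]  # hypothetical query words
expanded = query_expansion(example_query, n_top_word=5)
print(expanded)  # the original words plus their most frequent neighbors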
Example #2
def cut_off_words(corpus,
                  word_maximum_doc_percent,
                  word_minimum_count,
                  use_tfidf: bool = False):
    """
    Removes stop words and words whose document frequency is outside the given
    bounds from the corpus vocabulary.
    :param corpus: list of documents as strings
    :param word_maximum_doc_percent: maximum document frequency (max_df) a word may have, as a fraction of documents
    :param word_minimum_count: minimum document frequency (min_df) a word must have
    :param use_tfidf: if True, filter the vocabulary with a tf-idf based filter instead of document-frequency cut-offs
    :return: the filtered vocabulary
    """
    nltk.download('stopwords')
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if not use_tfidf:
        cv = CountVectorizer(max_df=word_maximum_doc_percent,
                             min_df=word_minimum_count,
                             stop_words=stop_words)
        cv_matrix = cv.fit_transform(corpus)
        words = key_dictionizer(cv.get_feature_names())
        return words
    else:
        cv = CountVectorizer(stop_words=stop_words)
        cv_matrix = cv.fit_transform(corpus)
        words = key_dictionizer(cv.get_feature_names())
        words = {v: k for k, v in words.items()}
        words = filter_tfidf(cv_matrix, words)

        return words
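
A minimal usage sketch of cut_off_words, assuming the stopword CSV and the local helpers key_dictionizer and filter_tfidf are available; the two-document corpus is made up:

sample_corpus = ["det er en god dag i dag", "en dag med sol og regn"]
kept_words = cut_off_words(sample_corpus,
                           word_maximum_doc_percent=0.9,
                           word_minimum_count=1)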
Example #3
def query_run_with_expansion():
    """
    A running example of the expanded query and yield the topic distribution
    for each query generated.
    :return: 
    """
    vectorizer = sp.load_npz("generated_files/tfidf_matrix.npz")
    words = utility.load_vector_file("generated_files/word2vec.csv")
    dictionary = Dictionary([words.values()])
    queries = generate_document_queries(vectorizer, words, 10, 4)
    expanded_queries = []

    for query in tqdm(queries.values()):
        expanded_queries.append(query_expansion(query, 5))

    lda = load_lda("LDA/model/document_model(83, None, 0.001)")
    topic_distributions = []
    for exp_query in expanded_queries:
        topic_distributions.append(
            get_document_topics_from_model(exp_query, lda, dictionary,
                                           lda.num_topics))

    for query, topic_dis in zip(expanded_queries, topic_distributions):
        print(f"Query: {query}")
        print(f"Topic distribution: {topic_dis}")
Example #4
def preprocess(
        filename_or_docs="2017_data.json",
        word_save_filename="generated_files/word2vec.csv",
        doc_save_filename="generated_files/doc2vec.csv",
        doc_word_save_filename="generated_files/doc2word.csv",
        doc_word_matrix_save_filename="generated_files/count_vec_matrix.npz",
        word_minimum_count=20,
        word_maximum_doc_percent=0.25,
        doc_minimum_length=20,
        save=True,
        word_check=True):
    """
    Preprocesses a json file into a doc-word count vectorization matrix, removing unhelpful words and documents.
    :param filename_or_docs: path of .json file to load (default: "2017_data.json") or the documents to preprocess
    :param word_save_filename: path of .csv file to save words in vector format. Only relevant if save=True.
    (default: "generated_files/word2vec.csv")
    :param doc_save_filename: path of .csv file to save documents in vector format. Only relevant if save=True.
    (default: "generated_files/doc2vec.csv")
    :param doc_word_save_filename: path of .csv file to map documents and contained words using ids.
    (default: "generated_files/doc2word.csv")
    :param doc_word_matrix_save_filename: path of the .npz file which contains the Count Vectorization matrix.
    (default: "generated_files/count_vec_matrix.npz")
    :param word_minimum_count: minimum amount a word must be used in the document set to be considered viable
    (default: 20).
    :param word_maximum_doc_percent: maximum percentage of documents that may contain a word for it to be considered
    viable (default: 0.25)
    :param doc_minimum_length: minimum amount of words for a document to be viable (default: 20).
    :param save: boolean indicating whether to save words and document files.
    :param word_check: boolean indicating whether to check words against word databases.
    Can be very slow when using new dataset, but is saved locally afterwards.
    :return: csr-matrix (sparse matrix) containing word frequencies for each document.
    """
    print('Beginning Preprocessing Procedure.')
    step = 1
    # load documents file
    print(f'Step {step}: loading documents.')
    # If filename_or_docs is a string, load documents from path, else continue as if given documents directly
    document_ids = load_document_file(filename_or_docs) if isinstance(
        filename_or_docs, str) else filename_or_docs
    # filter documents and create document set
    document_ids, documents = filter_documents(document_ids,
                                               doc_minimum_length)

    # cut off words that are used too often or too little (max/min document frequency) or are stop words
    step += 1
    print(f'Step {step}: stop words and word frequency.')
    # Filter words based on rarity
    vocab = gensim.corpora.Dictionary(documents)
    vocab.filter_extremes(word_minimum_count, word_maximum_doc_percent)
    # Get stopwords
    nltk.download('stopwords')
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    # Remove stopwords
    words = vocab.token2id
    bad_ids = []
    for sw in stop_words:
        if sw in words:
            bad_ids.append(words[sw])
    vocab.filter_tokens(bad_ids=bad_ids)
    vocab.compactify()

    if word_check:
        # cut off words that are not used in danish word databases or are wrong word type
        step += 1
        print(f"Step {step}: word databases and POS-tagging.")
        words = vocab.token2id
        bad_ids = word_checker(words)
        for sw in stop_words:
            if sw in words:
                bad_ids.append(words[sw])
        vocab.filter_tokens(bad_ids=bad_ids)
        vocab.compactify()

    # Stemming to combine word declensions
    step += 1
    print(f"Step {step}: Apply Stemming / Lemming")
    words = list(vocab.token2id.keys())
    vocab, documents = stem_lem(words, documents, stem_or_lem=False)

    # Remove tokens that are no longer in the vocabulary from each document
    for doc_id, doc in enumerate(documents):
        token_ids = vocab.doc2idx(doc)
        documents[doc_id] = [doc[i] for i in range(len(doc))
                             if token_ids[i] != -1]

    # transform documents into a matrix containing counts for each word in each document
    step += 1
    print(f"Step {step}: doc-word matrix construction")
    words = list(vocab.token2id.keys())
    cv = CountVectorizer(vocabulary=words)
    cv_matrix = cv.fit_transform([' '.join(x) for x in documents])
    print("Matrix is: " + str(cv_matrix.shape))

    if save:
        step += 1
        print(f'Step {step}: saving files.')
        utility.save_vector_file(word_save_filename, words)
        utility.save_vector_file(doc_save_filename, document_ids.keys())
        utility.save_vector_file(doc_word_save_filename,
                                 [' '.join(x) for x in documents])
        vocab.save("generated_files/vocab")
        sparse.save_npz(doc_word_matrix_save_filename, cv_matrix)
    print('Finished Preprocessing Procedure.')
    return cv_matrix, vocab, documents
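
A usage sketch of preprocess, assuming "2017_data.json" and the word_datasets files are in place; save=False and word_check=False skip writing files and the slow word-database check:

cv_matrix, vocab, documents = preprocess(save=False, word_check=False)
print(cv_matrix.shape)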
Example #5
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed words.
    :param stem_or_lem: bool indicating whether to apply stemming or lemmatizer. True is stem, False is lem.
    :param corpus: a list of sentences (strings of words separated by spaces)
    :param words: a list of words
    :return: new corpus and words list, were all words have been replaced by stemmed/lemmetized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # build up dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lem options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                if word not in translator:
                    # keep only lemma options that are not stop words
                    options = [x for x in lem if x not in stop_words]
                    if options:
                        translator[word] = " ".join(options)
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # update corpus to use stemmed words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()

    return diction, documents
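
A small usage sketch of stem_lem; the token lists are invented, and lemmy's Danish model plus the stopword CSV must be available:

docs = [["huset", "husene", "ligger"], ["bilen", "bilerne", "holder"]]
word_list = ["huset", "husene", "bilen", "bilerne", "ligger", "holder"]
dictionary, lemmatized_docs = stem_lem(word_list, docs, stem_or_lem=False)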
Example #6
import random
from typing import Dict, List

import numpy as np
import scipy.sparse as sp
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm

import preprocessing
import utility
from models import lda
from models.lda import get_document_topics_from_model, load_lda

doc2word = utility.load_vector_file("generated_files/doc2word.csv")
word2vec = utility.load_vector_file("generated_files/word2vec.csv")
inverse_w2v = {v: k for k, v in word2vec.items()}


def evaluate_query_doc(function, query: List[str], document_index: int):
    """
    Evaluate a query based on a function and document index
    :param function: the evaluation function
    :param query: the list of query words
    :param document_index: the index of the document
    :return: the product of the evaluate function
    """
    p_wd = []
    for word in query:
        word_index = inverse_w2v[word]
Example #7
from typing import List

import numpy as np
import scipy.sparse as sp
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import normalize
from tqdm import tqdm

import preprocessing
import query_handling
import utility

cv_matrix = sp.load_npz("Generated Files/count_vec_matrix.npz")
dt_matrix = sp.load_npz("Generated Files/(30, 0.1, 0.1)topic_doc_matrix.npz")
tw_matrix = sp.load_npz("Generated Files/(30, 0.1, 0.1)topic_word_matrix.npz")
wordfreq = np.array(cv_matrix.sum(axis=0))[0]
doc2word = utility.load_vector_file("Generated Files/doc2word.csv")
word2vec = utility.load_vector_file("Generated Files/word2vec.csv")
dirichlet_smoothing = sum([len(i) for i in list(doc2word.values())]) / len(doc2word)
inverse_w2v = {v: k for k, v in word2vec.items()}
result_matrix = np.matmul(dt_matrix.A, tw_matrix.A)
bm25 = BM25Okapi(list(doc2word.values()))


def bm25_evaluate_query(query: List[str]):
    return bm25.get_scores(query)


def tfidf_evaluate_query(query: List[str]):
    tfidf = preprocessing.cal_tf_idf(cv_matrix)
    # model = TfidfTransformer()
    # tfidf = model.fit_transform(cv_matrix)
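    # The example is cut off here; a plausible continuation (an assumption, not
    # the original code): sum the tf-idf weights of the query words to get a
    # per-document score, mirroring bm25_evaluate_query above.
    word_indexes = [inverse_w2v[word] for word in query if word in inverse_w2v]
    return np.array(tfidf[:, word_indexes].sum(axis=1)).flatten()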