def query_expansion(query: List[str], n_top_word: int = 10) -> List[str]:
    """
    Expands a given query based on the words in the query.
    Specifically, for each word we count how often each neighbouring word (the word
    immediately before and after it) occurs across the documents, and add the
    n_top_word most frequent neighbours to the query.
    :param query: list of words
    :param n_top_word: number of frequent neighbouring words to add per query word.
    :return: expanded query
    """
    documents = utility.load_vector_file("Generated Files/doc2word.csv")
    result = []
    for word in query:
        expanded_query = {}
        # find all documents containing the query word
        document_ids = [ids for ids, values in documents.items() if word in values]
        for new_id_doc in document_ids:
            # count the immediate neighbours of the (first occurrence of the) word in this document
            document = documents[new_id_doc]
            word_index = document.index(word)
            if word_index == 0:
                # first word: only the following word is a neighbour
                after_word = document[word_index + 1]
                expanded_query[after_word] = expanded_query.get(after_word, 0) + 1
            elif word_index == len(document) - 1:
                # last word: only the preceding word is a neighbour
                before_word = document[word_index - 1]
                expanded_query[before_word] = expanded_query.get(before_word, 0) + 1
            else:
                before_word = document[word_index - 1]
                after_word = document[word_index + 1]
                expanded_query[before_word] = expanded_query.get(before_word, 0) + 1
                expanded_query[after_word] = expanded_query.get(after_word, 0) + 1
        # sort neighbours by frequency, most frequent first
        sorted_query_words = [w for w, _ in sorted(expanded_query.items(), key=lambda x: x[1], reverse=True)]
        result.append(sorted_query_words[:n_top_word])
    # keep the original query words as well
    result.append(query)
    return list(set(itertools.chain.from_iterable(result)))
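# Hedged usage sketch (not part of the project code): query_expansion reads its corpus
# from "Generated Files/doc2word.csv", so this toy snippet only illustrates the
# neighbour-counting idea on an in-memory corpus. The names toy_docs and counts are
# illustrative assumptions.
from collections import Counter

toy_docs = {
    0: ["billig", "bolig", "leje", "aarhus"],
    1: ["bolig", "salg", "odense"],
    2: ["leje", "bolig", "salg"],
}
counts = Counter()
for doc in toy_docs.values():
    for i, token in enumerate(doc):
        if token == "bolig":
            if i > 0:
                counts[doc[i - 1]] += 1  # word immediately before
            if i < len(doc) - 1:
                counts[doc[i + 1]] += 1  # word immediately after
print(counts.most_common(2))  # the n_top_word most frequent neighbours, e.g. [('leje', 2), ('salg', 2)]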
def cut_off_words(corpus, word_maximum_doc_percent, word_minimum_count, use_tfidf: bool = False):
    """
    Removes stop words and words that occur in too many or too few documents from the corpus vocabulary.
    :param corpus: list of documents as strings
    :param word_maximum_doc_percent: maximum fraction of documents that may contain a word
    :param word_minimum_count: minimum number of documents a word must appear in
    :param use_tfidf: if True, filter the vocabulary with TF-IDF instead of document-frequency cut-offs
    :return: dictionary of the remaining words
    """
    nltk.download('stopwords')
    stop_words = stopwords.words('danish')
    stop_words.extend(list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if not use_tfidf:
        # let CountVectorizer drop stop words and words outside the document-frequency bounds
        cv = CountVectorizer(max_df=word_maximum_doc_percent,
                             min_df=word_minimum_count,
                             stop_words=stop_words)
        cv.fit_transform(corpus)
        words = key_dictionizer(cv.get_feature_names())
        return words
    else:
        # keep the full vocabulary (minus stop words) and filter it with TF-IDF instead
        cv = CountVectorizer(stop_words=stop_words)
        cv_matrix = cv.fit_transform(corpus)
        words = key_dictionizer(cv.get_feature_names())
        words = {v: k for k, v in words.items()}
        words = filter_tfidf(cv_matrix, words)
        return words
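# Hedged sketch (not project code) of what the non-TF-IDF branch above does:
# CountVectorizer drops words with document frequency above max_df or below min_df.
# The toy corpus and thresholds are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer

toy_corpus = [
    "hus til salg",
    "hus til leje",
    "lejlighed til leje i byen",
]
toy_cv = CountVectorizer(max_df=0.9, min_df=2)  # "til" occurs in every document -> cut by max_df
toy_cv.fit(toy_corpus)
print(sorted(toy_cv.vocabulary_))  # only "hus" and "leje" survive both cut-offs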
def query_run_with_expansion():
    """
    A running example that expands a set of generated queries and prints the
    topic distribution for each expanded query.
    :return:
    """
    vectorizer = sp.load_npz("generated_files/tfidf_matrix.npz")
    words = utility.load_vector_file("generated_files/word2vec.csv")
    dictionary = Dictionary([words.values()])
    queries = generate_document_queries(vectorizer, words, 10, 4)
    expanded_queries = []
    # query_expansion expects a list of words, so iterate over the query values only
    for query in tqdm(list(queries.values())):
        expanded_queries.append(query_expansion(query, 5))
    lda = load_lda("LDA/model/document_model(83, None, 0.001)")
    topic_distributions = []
    for exp_query in expanded_queries:
        topic_distributions.append(
            get_document_topics_from_model(exp_query, lda, dictionary, lda.num_topics))
    for query, topic_dis in zip(expanded_queries, topic_distributions):
        print(f"Topic distribution: {topic_dis}")
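# Hedged sketch (not project code) of what the get_document_topics_from_model call
# above boils down to in plain gensim: turn the (expanded) query into a bag of words
# and ask the LDA model for its topic distribution. The toy corpus, the two topics
# and all variable names here are illustrative assumptions.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_docs = [["bolig", "salg", "hus"], ["fodbold", "kamp", "sejr"], ["bolig", "leje", "hus"]]
toy_dict = Dictionary(toy_docs)
toy_lda = LdaModel(corpus=[toy_dict.doc2bow(d) for d in toy_docs],
                   id2word=toy_dict, num_topics=2, passes=10, random_state=0)
query_bow = toy_dict.doc2bow(["bolig", "salg"])
print(toy_lda.get_document_topics(query_bow, minimum_probability=0.0))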
def preprocess(filename_or_docs="2017_data.json",
               word_save_filename="generated_files/word2vec.csv",
               doc_save_filename="generated_files/doc2vec.csv",
               doc_word_save_filename="generated_files/doc2word.csv",
               doc_word_matrix_save_filename="generated_files/count_vec_matrix.npz",
               word_minimum_count=20,
               word_maximum_doc_percent=0.25,
               doc_minimum_length=20,
               save=True,
               word_check=True):
    """
    Preprocesses a json file into a doc-word count vectorization matrix, removing unhelpful words and documents.
    :param filename_or_docs: path of the .json file to load (default: "2017_data.json") or the documents to preprocess
    :param word_save_filename: path of .csv file to save words in vector format. Only relevant if save=True.
    (default: "generated_files/word2vec.csv")
    :param doc_save_filename: path of .csv file to save documents in vector format. Only relevant if save=True.
    (default: "generated_files/doc2vec.csv")
    :param doc_word_save_filename: path of .csv file mapping documents to their contained words using ids.
    (default: "generated_files/doc2word.csv")
    :param doc_word_matrix_save_filename: path of the .npz file which contains the count vectorization matrix.
    (default: "generated_files/count_vec_matrix.npz")
    :param word_minimum_count: minimum number of times a word must be used in the document set to be considered viable (default: 20).
    :param word_maximum_doc_percent: maximum percentage of documents that may contain a word for it to be considered viable (default: 0.25)
    :param doc_minimum_length: minimum number of words for a document to be viable (default: 20).
    :param save: boolean indicating whether to save words and document files.
    :param word_check: boolean indicating whether to check words against word databases. Can be very slow when using a new dataset, but the result is saved locally afterwards.
    :return: csr-matrix (sparse matrix) containing word frequencies for each document.
""" print('Beginning Preprocessing Procedure.') step = 1 # load documents file print(f'Step {step}: loading documents.') # If filename_or_docs is a string, load documents from path, else continue as if given documents directly document_ids = load_document_file(filename_or_docs) if isinstance( filename_or_docs, str) else filename_or_docs # filter documents and create document set document_ids, documents = filter_documents(document_ids, doc_minimum_length) # cut off words that are used too often or too little (max/min document frequency) or are stop words step += 1 print(f'Step {step}: stop words and word frequency.') # Filter words based on rarity vocab = gensim.corpora.Dictionary(documents) vocab.filter_extremes(word_minimum_count, word_maximum_doc_percent) # Get stopwords nltk.download('stopwords') stop_words = stopwords.words('danish') stop_words.extend( list(utility.load_vector_file("word_datasets/stopwords.csv").values())) # Remove stopwords words = vocab.token2id bad_ids = [] for sw in stop_words: if sw in words: bad_ids.append(words[sw]) vocab.filter_tokens(bad_ids=bad_ids) vocab.compactify() if word_check: # cut off words that are not used in danish word databases or are wrong word type step += 1 print(f"Step {step}: word databases and POS-tagging.") words = vocab.token2id bad_ids = word_checker(words) for sw in stop_words: if sw in words: bad_ids.append(words[sw]) vocab.filter_tokens(bad_ids=bad_ids) vocab.compactify() # Stemming to combine word declensions step += 1 print(f"Step {step}: Apply Stemming / Lemming") words = list(vocab.token2id.keys()) vocab, documents = stem_lem(words, documents, stem_or_lem=False) for id, x in enumerate(documents): test = vocab.doc2idx(x) documents[id] = [x[i] for i in range(len(x)) if test[i] != -1] # transform documents into a matrix containing counts for each word in each document step += 1 print(f"Step {step}: doc-word matrix construction") words = list(vocab.token2id.keys()) cv = CountVectorizer(vocabulary=words) cv_matrix = cv.fit_transform([' '.join(x) for x in documents]) print("Matrix is: " + str(cv_matrix.shape)) if save: step += 1 print(f'Step {step}: saving files.') utility.save_vector_file(word_save_filename, words) utility.save_vector_file(doc_save_filename, document_ids.keys()) utility.save_vector_file(doc_word_save_filename, [' '.join(x) for x in documents]) vocab.save("generated_files/vocab") sparse.save_npz(doc_word_matrix_save_filename, cv_matrix) print('Finished Preprocessing Procedure.') return cv_matrix, vocab, documents
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed or lemmatized words.
    :param words: a list of words
    :param documents: a list of documents, each a list of words
    :param stem_or_lem: bool indicating whether to apply stemming or lemmatization. True is stem, False is lem.
    :return: new dictionary and documents list, where all words have been replaced by stemmed/lemmatized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # stemming
        stemmer = DanishStemmer()
        # update the word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        # lemmatization
        lemmer = lemmy.load("da")
        # build a dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lemma options if they are not stop words
                add.extend([x for x in lem if x not in stop_words and x not in add])
                if word not in translator and lem not in stop_words:
                    # a word can have multiple lemmas; join them into one space-separated string
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    # update the corpus to use the stemmed/lemmatized words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        # re-split so that multi-word replacements become separate tokens
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence
    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()
    return diction, documents
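# Hedged sketch (not project code) of what the stemming branch records in the
# translator dict: different declensions of a word collapse to one stem.
from nltk.stem.snowball import DanishStemmer

toy_stemmer = DanishStemmer()
for declension in ["hus", "huset", "husene"]:
    print(declension, "->", toy_stemmer.stem(declension))  # all three should map to the stem "hus"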
import random
from typing import Dict, List

import numpy as np
import scipy.sparse as sp
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm

import preprocessing
import utility
from models import lda
from models.lda import get_document_topics_from_model, load_lda

doc2word = utility.load_vector_file("generated_files/doc2word.csv")
word2vec = utility.load_vector_file("generated_files/word2vec.csv")
inverse_w2v = {v: k for k, v in word2vec.items()}


def evaluate_query_doc(function, query: List[str], document_index: int):
    """
    Evaluate a query based on a function and document index
    :param function: the evaluation function
    :param query: the list of query words
    :param document_index: the index of the document
    :return: the product of the evaluate function
    """
    p_wd = []
    for word in query:
        word_index = inverse_w2v[word]
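# Hedged sketch (not project code) of the scoring idea the docstring above describes:
# a document's score for a query is the product of a per-word score. toy_score, its
# (word_index, document_index) signature, and the numbers below are assumptions for
# illustration only, not the project's evaluation function.
from math import prod


def toy_score(word_index: int, document_index: int) -> float:
    # stand-in for e.g. a smoothed per-word relevance probability
    return 0.2 if word_index == document_index else 0.05


toy_query_word_indices = [0, 1, 2]
toy_document_index = 1
print(prod(toy_score(w, toy_document_index) for w in toy_query_word_indices))  # 0.05 * 0.2 * 0.05 = 0.0005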
from typing import List

import numpy as np
import scipy.sparse as sp
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import normalize
from tqdm import tqdm

import preprocessing
import query_handling
import utility

cv_matrix = sp.load_npz("Generated Files/count_vec_matrix.npz")
dt_matrix = sp.load_npz("Generated Files/(30, 0.1, 0.1)topic_doc_matrix.npz")
tw_matrix = sp.load_npz("Generated Files/(30, 0.1, 0.1)topic_word_matrix.npz")
wordfreq = np.array(cv_matrix.sum(axis=0))[0]
doc2word = utility.load_vector_file("Generated Files/doc2word.csv")
word2vec = utility.load_vector_file("Generated Files/word2vec.csv")
dirichlet_smoothing = sum([len(i) for i in list(doc2word.values())]) / len(doc2word)
inverse_w2v = {v: k for k, v in word2vec.items()}
result_matrix = np.matmul(dt_matrix.A, tw_matrix.A)
bm25 = BM25Okapi(list(doc2word.values()))


def bm25_evaluate_query(query: List[str]):
    # BM25 relevance score of the query for every document in the corpus
    return bm25.get_scores(query)


def tfidf_evaluate_query(query: List[str]):
    tfidf = preprocessing.cal_tf_idf(cv_matrix)
    # model = TfidfTransformer()
    # tfidf = model.fit_transform(cv_matrix)
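# Hedged sketch (not project code) of the BM25 scoring used by bm25_evaluate_query,
# on a toy tokenised corpus: get_scores returns one relevance score per document
# for the query. The toy corpus is illustrative only.
from rank_bm25 import BM25Okapi

toy_corpus = [["hus", "til", "salg"], ["lejlighed", "til", "leje"], ["hus", "og", "have"]]
toy_bm25 = BM25Okapi(toy_corpus)
print(toy_bm25.get_scores(["hus", "salg"]))  # the first document scores highest: it contains both query terms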