import os
from collections import Counter

import nltk

# Project-local modules; the exact import paths below are assumptions based
# on the 'searchapp' directory layout referenced throughout this file.
from searchapp import corpus_enum, corpusAccess, indexAccess, vsm
from searchapp.index_and_dict import indexAndDictBuilder

ind = indexAndDictBuilder  # shorter alias used by classify() below


def check_spelling_vsm(query, corpus):
    """Spell-check a free-text (VSM) query against the index vocabulary.

    Returns the corrected query string if any term was corrected,
    otherwise None.
    """
    query_list = query.split(' ')
    corrected_query = query_list[:]
    spelling_error = False
    if corpus is corpus_enum.Corpus.COURSES:
        file_name = 'courseIndex.json'
    elif corpus is corpus_enum.Corpus.REUTERS:
        file_name = 'reutersIndex.json'
    else:
        raise ValueError('unknown corpus: {}'.format(corpus))
    path_to_index = os.path.join(
        os.getcwd(), 'searchapp', 'index_and_dict', file_name)
    index = indexAccess.getInvertedIndex(path_to_index=path_to_index)
    for i, term in enumerate(query_list):
        # Preprocess the term the same way the index was built so the
        # vocabulary lookup compares like with like.
        processed_term, ok = indexAndDictBuilder.preprocToken(
            term, stopword=False, stem=True, norm=True)
        if processed_term not in index:
            # Unknown term: assume a typo and substitute the best candidate.
            spelling_error = True
            corrected_query[i] = correct_spelling(term, index)
    if spelling_error:
        return ' '.join(corrected_query)
    return None
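
# correct_spelling() is called above but is not defined in this section. A
# minimal sketch of one way such a helper could work, assuming the inverted
# index's keys are the corpus vocabulary: return the vocabulary term with the
# smallest Levenshtein distance to the misspelled term. The function name and
# the use of nltk.edit_distance are illustrative, not the project's actual
# implementation.
def correct_spelling_sketch(term, index):
    best_term, best_dist = term, float('inf')
    for candidate in index:
        dist = nltk.edit_distance(term.lower(), candidate)
        if dist < best_dist:
            best_term, best_dist = candidate, dist
    return best_term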


def process_tokens(tokens, stopword, stem, norm):
    """Run each token through the index preprocessor, dropping any token
    the preprocessor rejects (e.g. stopwords when stopword removal is on)."""
    processed_tokens = []
    for token in tokens:
        token, ok = indexAndDictBuilder.preprocToken(token, stopword, stem, norm)
        if not ok:
            continue
        processed_tokens.append(token)
    return processed_tokens


def check_spelling_bool(query, corpus):
    """Spell-check a boolean query, leaving operators, wildcard terms and
    parentheses intact.

    Returns the corrected query string if any term was corrected,
    otherwise None.
    """
    query_list = query.split(' ')
    corrected_query = query_list[:]
    spelling_error = False
    if corpus is corpus_enum.Corpus.COURSES:
        file_name = 'courseIndex.json'
    elif corpus is corpus_enum.Corpus.REUTERS:
        file_name = 'reutersIndex.json'
    else:
        raise ValueError('unknown corpus: {}'.format(corpus))
    path_to_index = os.path.join(
        os.getcwd(), 'searchapp', 'index_and_dict', file_name)
    index = indexAccess.getInvertedIndex(path_to_index=path_to_index)
    for i, term in enumerate(query_list):
        # Strip parentheses, remembering how many so they can be restored
        # around the corrected term.
        front_pars_num = term.count('(')
        term = term.replace('(', '')
        rear_pars_num = term.count(')')
        term = term.replace(')', '')
        # Boolean operators and wildcard terms are never spell-checked.
        if term in ('AND', 'OR', 'AND_NOT') or '*' in term:
            continue
        processed_term, ok = indexAndDictBuilder.preprocToken(
            term, stopword=False, stem=True, norm=True)
        if processed_term not in index:
            spelling_error = True
            corrected_term = correct_spelling(term, index)
            # Reattach the stripped parentheses on both sides (the original
            # if/elif only restored one side, dropping a paren on terms
            # that carried both).
            corrected_query[i] = ('(' * front_pars_num + corrected_term
                                  + ')' * rear_pars_num)
    if spelling_error:
        return ' '.join(corrected_query)
    return None
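
# Example behaviour (hedged; the actual corrections depend on
# correct_spelling and on the corpus vocabulary): a call like
# check_spelling_bool('(pyhton AND coures)', corpus_enum.Corpus.COURSES)
# would plausibly return '(python AND course)', with the parentheses
# preserved and the AND operator untouched; it returns None when every
# term is already in the index.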


def classify(f):
    """Classify one Reuters document by k-nearest-neighbour vote (k = 5)."""
    query = ""
    body = corpusAccess.getContentsReuters(f)
    tokens = nltk.word_tokenize(body)
    for token in tokens:
        # Apply the same preprocessing as the inverted index.
        token, ok = ind.preprocToken(token, True, True, True)
        if not ok:
            continue
        query += " " + token
    # Feed the body of our document as the query to the VSM to get its
    # similarity to every other document.
    #
    # need_topics is set to True so the VSM only ranks documents that have
    # topics (i.e. our training set).
    #
    # We ask for the top 5 documents (k = 5).
    ranking = vsm.rank(query, 5, corpus_enum.Corpus.REUTERS, True, [])
    cand_topics = []
    # Collect the topics of the 5 nearest documents. Topics arrive as a
    # single comma-separated string; split() also covers the one-topic
    # case, where it returns a one-element list.
    for doc in ranking:
        topic = corpusAccess.getTopicsReuters(doc["docId"])
        if topic is None:
            continue
        cand_topics += topic.split(",")
    # Of all the candidate topics, take the one that occurs most often.
    c = Counter(cand_topics)
    tuple_topics = c.most_common(1)
    # The list comprehension is there in case we ever want to classify
    # documents with more than one topic (most_common(n) for n > 1).
    topics = [t for t, count in tuple_topics]
    return f, topics
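
# classify() returns an (f, topics) tuple, a shape that is convenient for
# mapping over many documents at once. A hedged usage sketch; the helper
# name is hypothetical, and doc_ids must be real Reuters document ids:
def classify_batch_sketch(doc_ids):
    labels = {}
    for doc_id, topics in map(classify, doc_ids):
        labels[doc_id] = topics
    return labels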


def get_suggestions(query, model, corpus, n_suggestions):
    """Suggest likely next words for a query term, using a precomputed
    bigram language model.

    NOTE: the 'model' argument is currently unused.
    """
    formatted_query, ok = indexAndDictBuilder.preprocToken(
        query, stopword=True, stem=False, norm=True)
    if not ok:
        return []
    if corpus == corpus_enum.Corpus.COURSES:
        bigram_path = ('searchapp/bigram_language_model/'
                       'courses_bigram_language_model.json')
    elif corpus == corpus_enum.Corpus.REUTERS:
        bigram_path = ('searchapp/bigram_language_model/'
                       'reuters_bigram_language_model.json')
    else:
        return []
    bigram_model = indexAccess.getInvertedIndex(bigram_path)
    try:
        # The model stores, for each word, the words observed to follow it;
        # take the first n_suggestions of them.
        suggestions = list(
            bigram_model[formatted_query]['conditional_words'])[:n_suggestions]
    except KeyError:
        # The word was never seen in the corpus, so there is nothing to
        # suggest. (The original bare except: would also have hidden real
        # errors, so we catch only the missing-key case.)
        suggestions = []
    return suggestions
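
# get_suggestions() assumes each entry of the bigram model JSON looks like
# {"word": {"conditional_words": {"next1": p1, "next2": p2, ...}}} with the
# next words stored most-frequent first (the [:n_suggestions] slice above
# relies on that insertion order). A hedged sketch of building such a model
# from a preprocessed token stream; the schema is an assumption inferred
# from the access pattern, not the project's actual builder:
def build_bigram_model_sketch(tokens):
    counts = {}
    # Count how often each word follows each other word.
    for prev, nxt in zip(tokens, tokens[1:]):
        counts.setdefault(prev, Counter())[nxt] += 1
    model = {}
    for prev, nxt_counts in counts.items():
        total = sum(nxt_counts.values())
        # most_common() yields descending frequency, so the dict (and the
        # JSON written from it) keeps the most likely next words first.
        model[prev] = {'conditional_words': {
            word: count / total for word, count in nxt_counts.most_common()}}
    return model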


def query_to_postfix(query, corpus):
    """Preprocess a boolean query (stemming, wildcard expansion) and
    convert it to postfix notation, shunting-yard style."""
    tokens = query.split(' ')
    # Stemming and wildcard management. We build a fresh token list rather
    # than mutating the one we iterate over: the original in-place inserts
    # shifted the indices of every token after a multi-word wildcard
    # expansion, corrupting the rest of the query.
    expanded = []
    for word in tokens:
        if word in ('AND', 'OR', 'AND_NOT'):
            expanded.append(word)
            continue
        # Strip parentheses, remembering how many to restore afterwards.
        front_pars_num = word.count('(')
        rear_pars_num = word.count(')')
        word = word.replace('(', '').replace(')', '')
        if '*' in word:
            # handle_wildcard() (not shown in this section) may expand a
            # wildcard term into several index terms; a wildcard with no
            # matches simply drops out of the query.
            formatted_words = handle_wildcard(word, corpus)
            for j, f_word in enumerate(formatted_words):
                # Leading parens go on the first expansion, trailing parens
                # on the last.
                if j == 0:
                    f_word = '(' * front_pars_num + f_word
                if j == len(formatted_words) - 1:
                    f_word = f_word + ')' * rear_pars_num
                expanded.append(f_word)
        else:
            formatted_word, ok = indexAndDictBuilder.preprocToken(
                word, stopword=False, stem=True, norm=True)
            expanded.append('(' * front_pars_num + formatted_word
                            + ')' * rear_pars_num)
    # Shunting-yard conversion to postfix. All operators share the same
    # precedence and are pushed without comparison, so chained operators
    # effectively bind right-to-left unless parenthesized.
    new_query = []
    op_stack = []
    for token in expanded:
        if token in ('AND', 'OR', 'AND_NOT'):
            op_stack.append(token)
            continue
        open_count = token.count('(')
        close_count = token.count(')')
        token = token.replace('(', '').replace(')', '')
        for _ in range(open_count):
            op_stack.append('(')
        new_query.append(token)
        for _ in range(close_count):
            # Pop operators back to the matching open paren.
            top_of_stack = op_stack.pop()
            while top_of_stack != '(':
                new_query.append(top_of_stack)
                top_of_stack = op_stack.pop()
    # Flush any operators still on the stack.
    while op_stack:
        new_query.append(op_stack.pop())
    return new_query
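
# A hedged sketch of the stack-based evaluator that such a postfix query
# would feed into; get_postings is a hypothetical callable returning the set
# of document ids for a term. For example, '(cat OR dog) AND bird' comes out
# of query_to_postfix() as ['cat', 'dog', 'OR', 'bird', 'AND'] and is
# evaluated innermost-first:
def evaluate_postfix_sketch(postfix, get_postings):
    stack = []
    for token in postfix:
        if token in ('AND', 'OR', 'AND_NOT'):
            right, left = stack.pop(), stack.pop()
            if token == 'AND':
                stack.append(left & right)   # intersection of postings
            elif token == 'OR':
                stack.append(left | right)   # union of postings
            else:  # AND_NOT
                stack.append(left - right)   # difference of postings
        else:
            stack.append(get_postings(token))
    return stack.pop() if stack else set()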