コード例 #1
0
def check_spelling_vsm(query, corpus):
    """Spell-check a free-text (VSM) query against the inverted index.

    Each whitespace-separated term is preprocessed the same way the index
    was built (no stopword removal, stemming and normalization on); terms
    missing from the index are replaced via correct_spelling().

    Args:
        query: space-separated query string.
        corpus: a corpus_enum.Corpus member selecting which index to load.

    Returns:
        The corrected query string if at least one term was corrected,
        otherwise None.

    Raises:
        ValueError: if corpus is not a supported Corpus member.
    """
    query_list = query.split(' ')
    corrected_query = query_list[:]
    spelling_error = False

    if corpus is corpus_enum.Corpus.COURSES:
        file_name = 'courseIndex.json'
    elif corpus is corpus_enum.Corpus.REUTERS:
        file_name = 'reutersIndex.json'
    else:
        # Previously fell through with file_name unbound -> NameError.
        raise ValueError('unsupported corpus: {}'.format(corpus))

    path_to_index = os.path.join(os.getcwd(), 'searchapp', 'index_and_dict',
                                 file_name)
    index = indexAccess.getInvertedIndex(path_to_index=path_to_index)

    for i, term in enumerate(query_list):
        # Preprocess exactly like index construction so lookup keys match.
        processed_term, ok = indexAndDictBuilder.preprocToken(term,
                                                              stopword=False,
                                                              stem=True,
                                                              norm=True)
        if processed_term not in index:
            spelling_error = True
            # Correct the raw term (not the stemmed form) so the user sees
            # a natural-looking suggestion.
            corrected_query[i] = correct_spelling(term, index)

    if spelling_error:
        return ' '.join(corrected_query)
    return None
コード例 #2
0
def process_tokens(tokens, stopword, stem, norm):
    """Preprocess *tokens* and keep only the ones that survive filtering.

    Each token is run through indexAndDictBuilder.preprocToken with the
    given stopword/stem/norm flags; tokens the preprocessor rejects
    (ok == False) are dropped.

    Returns:
        A list of the processed, accepted tokens, in input order.
    """
    preprocessed = (indexAndDictBuilder.preprocToken(tok, stopword, stem, norm)
                    for tok in tokens)
    return [result for result, accepted in preprocessed if accepted]
コード例 #3
0
def check_spelling_bool(query, corpus):
    """Spell-check a boolean query against the inverted index.

    Parentheses are stripped from each term before checking and restored
    around the correction afterwards. Boolean operators (AND, OR, AND_NOT)
    and wildcard terms (containing '*') are skipped.

    Args:
        query: space-separated boolean query string, e.g. '(catt AND dog)'.
        corpus: a corpus_enum.Corpus member selecting which index to load.

    Returns:
        The corrected query string if at least one term was corrected,
        otherwise None.

    Raises:
        ValueError: if corpus is not a supported Corpus member.
    """
    query_list = query.split(' ')
    corrected_query = query_list[:]
    spelling_error = False

    if corpus is corpus_enum.Corpus.COURSES:
        file_name = 'courseIndex.json'
    elif corpus is corpus_enum.Corpus.REUTERS:
        file_name = 'reutersIndex.json'
    else:
        # Previously fell through with file_name unbound -> NameError.
        raise ValueError('unsupported corpus: {}'.format(corpus))

    path_to_index = os.path.join(os.getcwd(), 'searchapp', 'index_and_dict',
                                 file_name)
    index = indexAccess.getInvertedIndex(path_to_index=path_to_index)

    for i, term in enumerate(query_list):
        # Remember how many parens wrapped the term so they can be
        # reattached to the corrected form.
        front_pars_num = term.count('(')
        term = term.replace('(', '')
        rear_pars_num = term.count(')')
        term = term.replace(')', '')

        # Operators and wildcard terms are never spell-checked.
        if term in ('AND', 'OR', 'AND_NOT') or '*' in term:
            continue

        processed_term, ok = indexAndDictBuilder.preprocToken(term,
                                                              stopword=False,
                                                              stem=True,
                                                              norm=True)
        if processed_term not in index:
            spelling_error = True
            corrected_term = correct_spelling(term, index)
            # Restore ALL surrounding parentheses. The old if/elif chain
            # dropped the closing parens when a term carried both, so
            # '(word)' came back as '(word'.
            corrected_query[i] = ('(' * front_pars_num
                                  + corrected_term
                                  + ')' * rear_pars_num)

    if spelling_error:
        return ' '.join(corrected_query)
    return None
コード例 #4
0
def classify(f):
    """Classify one Reuters document by topic using k-NN over the VSM.

    The document body is tokenized and preprocessed exactly like the
    inverted index, then used as a VSM query; the most common topic among
    the 5 most similar (topic-labelled) documents wins.

    Args:
        f: document identifier understood by corpusAccess.

    Returns:
        A (f, topics) tuple where topics is a list with the single most
        frequent candidate topic (empty if no neighbour had topics).
    """
    body = corpusAccess.getContentsReuters(f)
    terms = []
    for token in nltk.word_tokenize(body):
        # Apply same preprocessing as inverted index
        token, ok = ind.preprocToken(token, True, True, True)
        if ok:
            terms.append(token)
    # join() avoids quadratic string concatenation and the leading space
    # the old `query += " " + token` loop produced.
    query = " ".join(terms)

    # Input body of our document as the query to VSM
    # to get the similarity to every other document
    #
    # need_topics is set to True to ensure VSM only
    # ranks documents with topics (in our training set)
    #
    # We ask for the top 5 documents (k = 5)
    ranking = vsm.rank(query, 5, corpus_enum.Corpus.REUTERS, True, [])

    cand_topics = []
    # Collect every topic from the 5 nearest documents
    for doc in ranking:
        topic = corpusAccess.getTopicsReuters(doc["docId"])
        if topic is None:
            continue
        # Topics arrive as a comma-separated string; split() also handles
        # the single-topic (no comma) case, so no special branch is needed.
        cand_topics.extend(topic.split(","))

    # From all of our topics take the topic that occurs the most
    c = Counter(cand_topics)
    tuple_topics = c.most_common(1)
    # The list comprehension is if we wanted to
    # classify our documents with more than one topic
    topics = [t for t, count in tuple_topics]
    return f, topics
コード例 #5
0
def get_suggestions(query, model, corpus, n_suggestions):
    """Suggest up to n_suggestions follow-up words for *query*.

    The (normalized, unstemmed, stopword-filtered) query term is looked up
    in the corpus's bigram language model; the words most often observed
    after it are returned.

    Args:
        query: the word to complete from.
        model: unused here; kept for interface compatibility with callers.
        corpus: a corpus_enum.Corpus member selecting the bigram model.
        n_suggestions: maximum number of suggestions to return.

    Returns:
        A list of suggested next words (possibly empty).
    """
    formatted_query, ok = indexAndDictBuilder.preprocToken(query,
                                                           stopword=True,
                                                           stem=False,
                                                           norm=True)
    if not ok:
        return []

    if corpus == corpus_enum.Corpus.COURSES:
        bigram_path = 'searchapp/bigram_language_model/courses_bigram_language_model.json'
    elif corpus == corpus_enum.Corpus.REUTERS:
        bigram_path = 'searchapp/bigram_language_model/reuters_bigram_language_model.json'
    else:
        # Previously fell through with bigram_path unbound -> NameError.
        return []

    bigram_model = indexAccess.getInvertedIndex(bigram_path)

    # Narrow the old bare `except:` (which hid real bugs) to the failures
    # this lookup can legitimately produce: term absent from the model, or
    # the model itself missing/None.
    try:
        conditional_words = bigram_model[formatted_query]['conditional_words']
    except (KeyError, TypeError):
        return []

    return list(conditional_words.keys())[:n_suggestions]
コード例 #6
0
def query_to_postfix(query, corpus):
    """Convert a space-separated infix boolean query into postfix order.

    First pass: each non-operator word is stripped of parentheses, then
    either expanded via handle_wildcard (if it contains '*') or
    stemmed/normalized via preprocToken; the parentheses are reattached
    around the formatted result. Second pass: a shunting-yard-style loop
    uses an operator stack to emit operands immediately and operators on
    closing parentheses / at end of input. All operators (AND, OR,
    AND_NOT) are treated with equal precedence — only parentheses group.

    Args:
        query: infix query string, e.g. '(cat AND do*) OR fish'.
        corpus: corpus selector forwarded to handle_wildcard.

    Returns:
        A list of tokens in postfix order (operands before operators).
    """
    new_query = []
    op_stack = []
    query = query.split(' ')

    # stemming and wildcard management
    # Iterate over a snapshot (query[:]) because wildcard expansion
    # inserts new items into `query` while this loop runs.
    for i, word in enumerate(query[:]):
        if not (word == 'AND' or word == 'OR' or word == 'AND_NOT'):
            # Count and strip parentheses so preprocessing sees the bare
            # term; they are re-added to the formatted output below.
            front_pars_num = word.count('(')
            if front_pars_num > 0:
                word = word.replace('(', '')
            rear_pars_num = word.count(')')
            if rear_pars_num > 0:
                word = word.replace(')', '')

            if '*' in word:
                # Expand the wildcard into every matching index term.
                formatted_words = handle_wildcard(word, corpus)
                if len(formatted_words) == 0:
                    # NOTE(review): this leaves a None token in `query`;
                    # the second pass's `'(' in token` test would raise
                    # TypeError on it — confirm this path is handled
                    # upstream (e.g. wildcards always match something).
                    query[i] = None
                for j, f_word in enumerate(formatted_words):
                    if j == 0:
                        # First expansion replaces the original word,
                        # carrying the opening (or, for a single-word
                        # expansion, the closing) parens.
                        pars = ''
                        if len(formatted_words) == 1 and rear_pars_num > 0:
                            for x in range(rear_pars_num):
                                pars += ')'
                            query[i] = f_word + pars
                        elif front_pars_num > 0:
                            for x in range(front_pars_num):
                                pars += '('
                            query[i] = pars + f_word
                        else:
                            query[i] = f_word
                    else:
                        # Remaining expansions are inserted after it; the
                        # last one takes the closing parens.
                        if j == len(formatted_words) - 1 and rear_pars_num > 0:
                            pars = ''
                            for x in range(rear_pars_num):
                                pars += ')'
                            f_word += pars
                        # NOTE(review): insert() shifts every later index,
                        # so subsequent `query[i]` writes in this loop may
                        # target the wrong element once a wildcard has
                        # expanded to multiple words — confirm with a
                        # query containing two wildcards.
                        query.insert(i + j, f_word)
            else:
                # Plain term: apply the same stemming/normalization used
                # when the index was built.
                formatted_word, ok = indexAndDictBuilder.preprocToken(
                    word, stopword=False, stem=True, norm=True)

                # Reattach the stripped parentheses around the formatted
                # term (opening before, closing after).
                if front_pars_num > 0:
                    word_par = ''
                    for x in range(front_pars_num):
                        word_par += '('
                    word_par += formatted_word
                elif rear_pars_num > 0:
                    word_par = formatted_word
                    for x in range(rear_pars_num):
                        word_par += ')'
                else:
                    word_par = formatted_word

                query[i] = word_par

    # Shunting-yard pass: operands go straight to the output; operators
    # wait on the stack until a ')' (or end of input) pops them.
    for token in query:
        if token == 'AND' or token == 'OR' or token == 'AND_NOT':
            op_stack.append(token)
        elif '(' in token:
            # Emit the operand, then push one '(' marker per paren.
            par_count = token.count('(')
            token = token.replace('(', '')
            new_query.append(token)
            for i in range(par_count):
                op_stack.append('(')
        elif ')' in token:
            # Emit the operand, then pop operators down to each matching
            # '(' marker.
            par_count = token.count(')')
            token = token.replace(')', '')
            new_query.append(token)

            for i in range(par_count):
                top_of_stack = op_stack.pop()
                while top_of_stack != '(':
                    new_query.append(top_of_stack)
                    top_of_stack = op_stack.pop()
        else:
            new_query.append(token)

    # Flush any operators left on the stack.
    for i in range(len(op_stack)):
        new_query.append(op_stack.pop())

    return new_query