Example #1
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        #lower-case query
        self.raw_query = self.raw_query.lower()

        #eliminate numbers
        self.raw_query = re.sub(r'\d+', '', self.raw_query)

        #tokenizing
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(self.raw_query)

        self.words = []

        #spell correction, stop word removal, stemming
        for token in tokens:
            token = norvig_spell.correction(token)
            if not util.isStopWord(token):
                self.words.append(util.stemming(token))

        return self.words
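
The docstrings keep asking why the spelling corrector must run before stopword removal and stemming: the corrector ranks candidate edits against a vocabulary of surface-form words, so it has to see the raw token rather than a stem. A minimal, self-contained sketch of the effect, using a toy stand-in for norvig_spell.correction (the toy vocabulary and the "same length, one differing character" rule are illustrative assumptions, not the project's real module):

from nltk.stem import PorterStemmer

# toy stand-in for norvig_spell.correction: a tiny surface-form vocabulary and
# a crude matching rule (same length, at most one differing character)
VOCAB = {"boundaries", "layer", "experimental"}

def toy_correction(word):
    if word in VOCAB:
        return word
    for known in VOCAB:
        if len(known) == len(word) and sum(a != b for a, b in zip(word, known)) == 1:
            return known
    return word  # no close surface-form match: give up

ps = PorterStemmer()
token = "bounderies"                   # misspelled query token
print(toy_correction(token))           # 'boundaries' -- corrected against the surface vocabulary
print(toy_correction(ps.stem(token)))  # stemming first mangles the token, so the correction is missed
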
Example #2
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms


        # load the stopword list once instead of re-reading the file per token
        ps = PorterStemmer()
        with open("stopwords") as f:
            stopwords = set(line.strip() for line in f)

        for q in self.raw_query:
            # tokenize and lowercase the query text
            query_tokens = word_tokenize(self.raw_query[q].text)
            query_tokens = [token.lower() for token in query_tokens]
            # spelling correction runs on the raw surface forms, before stemming
            query_tokens = [correction(token) for token in query_tokens]
            # stem, then drop any token whose stemmed form is a stopword
            query_tokens = [ps.stem(token) for token in query_tokens]
            query_tokens = [token for token in query_tokens if token not in stopwords]
            self.preprocessed_query_tokens[q] = query_tokens
Example #3
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        # Tokenize and lowercase doc into list form
        token_list = util.tokenize_doc(self.raw_query)

        # Helper function that replaces stopwords with an empty string,
        # keeping the token positions aligned instead of shrinking the list
        def remove_stop_word(tok):
            return "" if util.isStopWord(tok) else tok

        # Correct the spelling of each token
        tokens_corrected_spell = list(map(correction, token_list))

        # Blank out the stopwords while preserving positions
        token_list_no_stopword = list(
            map(remove_stop_word, tokens_corrected_spell))

        # Stem the words
        stemmed_token_list = list(
            map(util.stemming, token_list_no_stopword))

        return stemmed_token_list
Example #4
    def preprocessing(self, qid):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        qbody = self.raw_query
        qbody = qbody.get(qid)  # self.convertFromMap(qbody)   #self.docs
        print("Below is the query: ")
        print(qbody.text)
        # some query IDs (e.g. 005) have no text in query.text, so this
        # substitution may raise an exception
        try:
            qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
        except Exception:
            print("Query ID with no text: ", qid)
            raise

        tokens = nltk.tokenize.word_tokenize(qbody)

        corrected_tokens = [correction(word) for word in tokens] #spell check
        converted_tokens = [word.lower() for word in corrected_tokens]
        # clean_query will hold the query terms with stop words removed
        clean_query = []

        for word in converted_tokens:  # removing stop words
            if not util.isStopWord(word):
                clean_query.append(util.stemming(word))
        if len(clean_query) > 0:
            self.query.append(clean_query)

        print("Query after spell check and  removing the stop words: ", self.query)
Example #5
 def tokenize_text_for_q(self, doc):
     tokenizer = RegexpTokenizer(r'\w+')
     list_token = tokenizer.tokenize(doc)
     # because of the limited size of our corpus, spelling correction results in slight boost
     # with a larger corpus you would not do this, especially due to the simplicity of the spelling correction
     list_token = [
         correction(word.lower())
         if word not in self.known_words else word.lower()
         for word in list_token
     ]
     return list_token
Example #6
 def preprocessing(self):
     """ apply the same preprocessing steps used by indexing,
         also use the provided spelling corrector. Note that
         spelling corrector should be applied before stopword
         removal and stemming (why?)"""
     print(self.raw_query)
     tokens = word_tokenize(self.raw_query)
     alpha_tokens = [
         norvig_spell.correction(token) for token in tokens
         if token.isalpha()
     ]  # spell-correct each token, dropping punctuation and numbers
     self.tokens = [
         util.stemming(token.lower()) for token in alpha_tokens
         if not util.isStopWord(token)
     ]  # drop stopwords, then lowercase and stem
     return self.tokens
Example #7
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms
        for x in self.raw_query:
            lower_case = util.query_lower(self.raw_query[x].text)
            lower_case = list(
                map(lambda y: norvig_spell.correction(y), lower_case)
            )  #spelling correction is done before stemming and removing of stop words
            lower_case = list(map(lambda y: util.stemming(y), lower_case))
            for i in list(lower_case):
                if util.isStopWord(i):
                    lower_case.remove(i)
            QueryProcessor.preprocessed_query[x] = lower_case
Example #8
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector'''

        #ToDo: return a list of terms
        # tokenize the query and strip punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        querytoken = tokenizer.tokenize(self.raw_query)
        # check each query token for spelling errors and store the corrected
        # words in Query
        for token in querytoken:
            to_lower = ''.join(
                norvig_spell.words(token))  # words() lowercases; join the list back into a string
            spellcorrection = norvig_spell.correction(to_lower)
            Query.append(spellcorrection)
            stopword = isStopWord(spellcorrection)
            if not stopword:
                stemqueryterm = stemming(spellcorrection)
                Queryterm.append(stemqueryterm)
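
Example #8 calls norvig_spell.words only to lowercase a single token before correcting it. In Norvig's published spelling corrector that helper is a one-line regex tokenizer, roughly as sketched below (shown for context; the project's actual module may differ):

import re

def words(text):
    # extract lowercased word tokens from raw text
    return re.findall(r'\w+', text.lower())

print(words("Boundary Layer effects"))  # ['boundary', 'layer', 'effects']
print(''.join(words("Shear")))          # 'shear' -- the lowercasing trick used in Example #8
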
Example #9
    def preprocessing(self):

        tokenizer = RegexpTokenizer(r'\w+')
        querytokens = tokenizer.tokenize(self.raw_query)
        self.q_tf_dino = len(querytokens)  #length of the query
        # spelling correction first, on the raw surface forms
        querytokens = [norvig_spell.correction(token) for token in querytokens]

        # drop stop words, then stem what remains (popping from the list while
        # enumerating it would skip tokens, so build a new list instead)
        querytokens = [
            stemming(token) for token in querytokens if not isStopWord(token)
        ]

        print("Query tokens", querytokens)
        return querytokens
Example #10
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        corrected_terms_list = list()
        for term in self.raw_query.split(' '):  #splitting on white space
            corrected_terms_list.append(norvig_spell.correction(term))
        try:
            # the Cranfield-trained spelling corrector sometimes produces 'gw';
            # drop it if present
            corrected_terms_list.remove('gw')
        except ValueError:
            pass
        corrected_terms_text = ' '.join(corrected_terms_list)

        terms = util.splitDoc(corrected_terms_text)

        return terms  #list of terms
Example #11
    def preprocessing(self, qid):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        qbody = self.raw_query
        cqObj = CranFile('query.text')
        qbody = qbody.get(qid)  #self.convertFromMap(qbody)   #self.docs
        #print(qbody)
        try:
            qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
        except Exception:
            print("Query ID with no text: ", qid)
            raise

        reduced = nltk.tokenize.word_tokenize(qbody)

        correctedwords = [correction(word) for word in reduced]
        lowercasewords = [word.lower() for word in correctedwords]
        notstopwords = []
        for word in lowercasewords:
            if not util.isStopWord(word):
                notstopwords.append(util.stemming(word))
        if len(notstopwords) > 0:
            self.query.append(notstopwords)

        print("1...", self.query)
Example #12
 def correct(self, sentence, position):
     word = sentence[position]
     import norvig_spell
     return norvig_spell.correction(word)
Example #13
def spellcheck(words):
    # run the spelling corrector over every token in the list
    return [correction(word) for word in words]
Example #14
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. '''
        #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order
        # You can use term frequency or TFIDF to construct the vectors
        #constructing document vector for document 1
        vectorResult = []
        cf = CranFile('cran.all')
        documentVector = {}
        queryVector = {}
        ps = PorterStemmer()
        finalResult = {}
        for q in self.raw_query:
            if q == self.queryId:
                # tokenize, lowercase, and spell-correct before stemming and
                # stopword removal
                query_tokens = word_tokenize(self.raw_query[q].text)
                query_tokens = [token.lower() for token in query_tokens]
                query_tokens = [correction(token) for token in query_tokens]
                query_tokens = [ps.stem(token) for token in query_tokens]
                # load the stopword list once and drop stemmed tokens that appear in it
                with open("stopwords") as f:
                    stopwords = set(line.strip() for line in f)
                query_tokens = [token for token in query_tokens if token not in stopwords]

                #block to calculate query vector start

                # tf-idf weight for each query term: idf * (1 + log10(tf))
                for token in query_tokens:
                    if token in self.index.items:
                        tf = query_tokens.count(token)
                        queryVector[token] = self.index.items[token].get('idf') * (1 + math.log(tf, 10))
                    else:
                        queryVector[token] = 0
                #block to calculate query vector end
                docidScorepair = {}
                for doc in cf.docs:
                    # build the tf-idf vector for this document
                    titletoken = word_tokenize(doc.title)
                    bodytoken = word_tokenize(doc.body)
                    tokens = titletoken + bodytoken
                    tokens = [element.lower() for element in tokens]
                    # drop stopwords, then stem the document tokens
                    tokens = [token for token in tokens if token not in stopwords]
                    tokens = [ps.stem(token) for token in tokens]
                    # term weight from the index postings: (1 + log10(tf)) * idf
                    for token in tokens:
                        if token in self.index.items:
                            tf = self.index.items[token].get('posting').get(doc.docID).get('termfreq')
                            documentVector[token] = (1 + math.log(tf, 10)) * self.index.items[token].get('idf')
                        else:
                            documentVector[token] = 0
                    # normalize the document vector to unit length; the query vector is left
                    # unnormalized, since scaling it by a constant does not change the ranking
                    sumofsquares = sum(weight * weight for weight in documentVector.values())
                    try:
                        norm = 1 / math.sqrt(sumofsquares)
                    except ZeroDivisionError:
                        norm = 0
                    for term in documentVector:
                        documentVector[term] = documentVector[term] * norm

                    # score = dot product of the query vector with the normalized document vector
                    cosineVector = {}
                    for term in queryVector:
                        cosineVector[term] = documentVector.get(term, 0) * queryVector[term]

                    docidScorepair[doc.docID] = sum(cosineVector.values())

                    self.intermediateResultVectorQuery[q] = docidScorepair

                    documentVector = {}
                queryVector = {}

                # rank documents by score and keep the top k docIDs
                counterObject = Counter(self.intermediateResultVectorQuery[q])
                high = counterObject.most_common(k)
                if k == 3:
                    print(high)
                vectorResult = [pair[0] for pair in high]
        return vectorResult
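
Example #14 weights each term with idf * (1 + log10(tf)), normalizes only the document vector, and ranks documents by the dot product of the two vectors. The same computation on a toy index (the idf values, term frequencies, and document IDs below are made-up stand-ins for self.index.items and the Cranfield postings):

import math

# made-up idf values and per-document term frequencies
idf = {"boundari": 1.2, "layer": 0.9, "flow": 0.4}
doc_tf = {"d1": {"boundari": 3, "layer": 1}, "d2": {"flow": 5, "layer": 2}}
query_terms = ["boundari", "layer"]

def weight(tf, idf_value):
    # the weighting used in Example #14: (1 + log10(tf)) * idf
    return (1 + math.log(tf, 10)) * idf_value

# query vector: term frequency counted inside the query itself
query_vec = {t: weight(query_terms.count(t), idf.get(t, 0)) for t in set(query_terms)}

scores = {}
for doc_id, tfs in doc_tf.items():
    doc_vec = {t: weight(tf, idf.get(t, 0)) for t, tf in tfs.items()}
    norm = math.sqrt(sum(w * w for w in doc_vec.values())) or 1  # unit-length document vector
    doc_vec = {t: w / norm for t, w in doc_vec.items()}
    # dot product over the query terms, as in the cosineVector loop above
    scores[doc_id] = sum(query_vec[t] * doc_vec.get(t, 0) for t in query_vec)

print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))  # ranking behind most_common(k)

Normalizing only the document vector is enough for ranking: dividing every score by the same query norm would not change the order of the documents.
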
Example #15
 def spell_correction(self, list_token):
     # correct each token in the list with the spelling corrector
     return [correction(item) for item in list_token]