    def indexDoc(self, doc):  # indexing a Document object
        ''' indexing a document using the simple SPIMI algorithm; no need to store blocks due to the small collection we are handling, so the whole index is saved/loaded instead '''

        # ToDo: indexing only title and body; use some functions defined in util.py

        title_tokens = word_tokenize(doc.subject)
        body_tokens = word_tokenize(doc.body)
        tokens = title_tokens + body_tokens

        for position, token in enumerate(tokens):
            # skip stop words; do not pop from the list while iterating,
            # otherwise tokens get skipped and the recorded positions shift
            if isStopWord(token):
                continue
            # perform stemming; the token position is its index in the merged list
            stemmed_token = stemming(token)
            if stemmed_token in self.items:
                self.items[stemmed_token].add(doc.docID, position, doc.class_name)
            else:
                new_item = IndexItem(stemmed_token)
                new_item.add(doc.docID, position, doc.class_name)
                self.items[stemmed_token] = new_item
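The IndexItem class used above is defined elsewhere in the project. As a rough guide, a minimal sketch consistent with how it is called in this example (add(docID, position, class_name) filling a positional posting) might look like the code below; the field names are assumptions, and later examples (e.g. #15 and #17) clearly assume a richer posting structure.

class IndexItem:
    ''' minimal sketch of an index entry: one term plus its positional postings '''

    def __init__(self, term):
        self.term = term
        self.posting = {}    # docID -> list of positions where the term occurs

    def add(self, docID, position, class_name=None):
        # record one occurrence of the term at `position` in document `docID`;
        # class_name (if given) could be stored alongside for classification use
        self.posting.setdefault(docID, []).append(position)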
Example #2
    def preprocessing(self, qid):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        qbody = self.raw_query
        qbody = qbody.get(qid)  # self.convertFromMap(qbody)   #self.docs
        print("Below is the query: ")
        print(qbody.text)
        # some query IDs (e.g. 005) have no text in query.text, so the
        # substitution below can raise; catch it and report the offending qid
        try:
            qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
        except Exception:
            print("Query ID which is not having text: ", qid)
            raise

        tokens = nltk.tokenize.word_tokenize(qbody)

        corrected_tokens = [correction(word) for word in tokens] #spell check
        converted_tokens = [word.lower() for word in corrected_tokens]
        #the cleaned query below will not contain stop words
        clean_query = []

        for word in converted_tokens:  #removing stop words, then stemming
            if not util.isStopWord(word):
                clean_query.append(util.stemming(word))
        if len(clean_query) > 0:
            self.query.append(clean_query)

        print("Query after spell check and removing the stop words: ", self.query)
Example #3
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        # Tokenize and lowercase doc into list form
        token_list = util.tokenize_doc(self.raw_query)

        # Helper function to replace stopwords with empty string
        def remove_stop_word(tok):
            return "" if util.isStopWord(tok) else tok

        # Correct spelling of each word
        tokens_corrected_spell = list(
            map(lambda tok: correction(tok), token_list))

        # Replace stopwords with empty strings so the list length is preserved
        token_list_no_stopword = list(
            map(remove_stop_word, tokens_corrected_spell))

        # Stem the words
        stemmed_token_list = list(
            map(lambda tok: util.stemming(tok), token_list_no_stopword))

        return stemmed_token_list
Example #4
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        #lower-case query
        self.raw_query = self.raw_query.lower()

        #eliminate numbers
        self.raw_query = re.sub(r'\d+', '', self.raw_query)

        #tokenizing
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(self.raw_query)

        self.words = []

        #spell correction, stop word removal, stemming
        for i in tokens:
            i = norvig_spell.correction(i)
            if not util.isStopWord(i):
                self.words.append(util.stemming(i))

        return self.words  # return the list of terms, as the ToDo asks
Example #5
    def indexDoc(self, doc):  # indexing a Document object
        ''' indexing a document using the simple SPIMI algorithm; no need to store blocks due to the small collection we are handling, so the whole index is saved/loaded instead '''
        #tokenize the document body
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(doc.body)
        self.len_body = len(tokens)

        for position, token in enumerate(tokens):
            #remove stop words; do not pop while iterating, which would skip
            #tokens and shift the recorded positions
            if isStopWord(token):
                continue
            #replace the token with its stem
            stemmed_term = stemming(token)

            #if the index already contains the term
            if stemmed_term in self.items:
                #same term in the same document: append the position and
                #increase its term frequency
                if doc.docID in self.items[stemmed_term]:
                    self.items[stemmed_term][doc.docID].append(position)
                    self.term_freq[stemmed_term][doc.docID] += 1 / self.len_body
                #same term in a different document
                else:
                    self.items[stemmed_term][doc.docID] = [position]
                    self.term_freq[stemmed_term][doc.docID] = 1 / self.len_body
            #insert the new term and its posting
            else:
                self.items[stemmed_term] = {doc.docID: [position]}
                self.term_freq[stemmed_term] = {doc.docID: 1 / self.len_body}
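For a toy document with body "information retrieval information" and docID 1, the structures built above would take roughly the following shape (Porter-style stems shown purely for illustration):

# items:     term -> {docID: [positions]}
# term_freq: term -> {docID: occurrences / document length}
items     = {'inform': {1: [0, 2]}, 'retriev': {1: [1]}}
term_freq = {'inform': {1: 2 / 3}, 'retriev': {1: 1 / 3}}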
Example #6
    def indexDoc(self, doc):  # indexing a Document object
        ''' indexing a document using the simple SPIMI algorithm; no need to store blocks due to the small collection we are handling, so the whole index is saved/loaded instead '''

        # Using the SPIMI algorithm as defined at
        # https://nlp.stanford.edu/IR-book/html/htmledition/single-pass-in-memory-indexing-1.html

        # Each term in a doc has its own index item!!!

        # Preprocess first...
        # Call tokenize_doc to convert doc title and body into tokenized list, in lowercase
        # Do remove stopwords and stemming as expected

        # ToDo: indexing only title and body; use some functions defined in util.py
        # (1) convert to lower cases,
        # (2) remove stopwords,
        # (3) stemming

        # Then go term-by-term and create the index. Use algorithm to track which terms already in index, add new ones if not. If we create a new index item, add it to the self.items dict!!!

        # ---

        # Increment number of documents indexed
        self.nDocs += 1

        # Grab the body of the doc as a single string (the title is not indexed here)
        doc_string = doc.body

        # Tokenize and lowercase doc into list form
        token_list = util.tokenize_doc(doc_string)

        # Helper function to replace stopwords with empty string
        def remove_stop_word(tok):
            return "" if util.isStopWord(tok) else tok

        # Replace stopwords with empty strings so token positions are preserved
        token_list_no_stopword = list(map(remove_stop_word, token_list))

        # Stem the words
        stemmed_token_list = list(
            map(lambda tok: util.stemming(tok), token_list_no_stopword))

        # Note that the stemmed tokens are now our terms
        for pos, term in enumerate(stemmed_token_list):
            # Skip over stopwords, now replaced by ""
            if term == "": continue

            # Create a new IndexItem for an unseen term, then add this
            # (docID, position) occurrence to the term's posting
            if term not in self.items:
                self.items[term] = IndexItem(term)
            self.items[term].add(int(doc.docID), pos)
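util.tokenize_doc is a project helper that is not shown here; based on the comment above ("convert doc title and body into tokenized list, in lowercase"), a minimal stand-in might be:

from nltk.tokenize import word_tokenize

def tokenize_doc(doc_string):
    # assumed behaviour: lowercase the text and split it into word tokens
    return [tok.lower() for tok in word_tokenize(doc_string)]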
Example #7
 def preprocessing(self):
     """ apply the same preprocessing steps used by indexing,
         also use the provided spelling corrector. Note that
         spelling corrector should be applied before stopword
         removal and stemming (why?)"""
     print(self.raw_query)
     tokens = word_tokenize(self.raw_query)
     alpha_tokens = [
         norvig_spell.correction(token) for token in tokens
         if token.isalpha()
     ]  # tokenize the query, run the norvig_spell check, and remove punctuation
     self.tokens = [
         util.stemming(token.lower()) for token in alpha_tokens
         if not util.isStopWord(token)
     ]  # remove stopwords
     return self.tokens
Example #8
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms
        for x in self.raw_query:
            lower_case = util.query_lower(self.raw_query[x].text)
            #spelling correction is done before stopword removal and stemming
            lower_case = list(map(norvig_spell.correction, lower_case))
            #remove stop words before stemming; a stemmed stop word
            #(e.g. "this" -> "thi") would no longer match the stopword list
            lower_case = [y for y in lower_case if not util.isStopWord(y)]
            lower_case = list(map(util.stemming, lower_case))
            QueryProcessor.preprocessed_query[x] = lower_case
Example #9
    def preprocessing(self):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector'''

        #ToDo: return a list of terms
        'tokenization of query along with removal of punctuation'
        tokenizer = RegexpTokenizer(r'\w+')
        querytoken = tokenizer.tokenize(self.raw_query)
        '''check each query token for spelling errors, correct it, and
        store the corrected words in the Query list'''
        for token in querytoken:
            to_lower = ''.join(
                norvig_spell.words(token))  #norvig_spell.words returns lower-cased word pieces; join them back into one string
            spellcorrection = norvig_spell.correction(to_lower)
            Query.append(spellcorrection)
            stopword = isStopWord(spellcorrection)
            if not stopword:
                stemqueryterm = stemming(spellcorrection)
                Queryterm.append(stemqueryterm)
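norvig_spell is used throughout these examples but never shown. A minimal sketch of its two helpers, following Peter Norvig's well-known corrector, is given below; the tiny inline corpus is only for illustration (the real module would build WORDS from a large text file).

import re
from collections import Counter

CORPUS = "the quick brown fox jumps over the lazy dog the dog barks at the fox"
WORDS = Counter(re.findall(r'\w+', CORPUS.lower()))

def words(text):
    # lower-case the text and split it into alphanumeric word tokens
    return re.findall(r'\w+', text.lower())

def edits1(word):
    # every string one edit away from word: deletes, transposes, replaces, inserts
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def known(candidates):
    return {w for w in candidates if w in WORDS}

def correction(word):
    # prefer the word itself if known, then known words one edit away,
    # otherwise give the word back unchanged; pick the most frequent candidate
    candidates = known([word]) or known(edits1(word)) or [word]
    return max(candidates, key=lambda w: WORDS[w])

print(correction("teh"))   # -> 'the'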
Example #10
    def indexDoc(self, doc):
        # ToDo: indexing only title and body; use some functions defined in util.py
        # (1) convert to lower cases,
        # (2) remove stopwords,
        # (3) stemming

        doc_body = doc.body
        doc_docid = doc.docID
        tokens = self.tokenization(doc_body)
        reduced_list = []
        #load the stop words into a list
        stop_words = util.reading_stop_words()

        for word in tokens:
            if word not in stop_words:
                reduced_list.append(util.stemming(word))

        #print(reduced_list)
        pos_dic = self.make_word_postition_dictionary(reduced_list)
        self.make_word_document_dictionaray(pos_dic, doc_docid)
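make_word_postition_dictionary and make_word_document_dictionaray are class helpers that are not shown. Judging from the call above, the first presumably maps each term to the positions where it occurs in the reduced token list; a stand-alone sketch of that assumed behaviour:

def make_word_position_dictionary(terms):
    # assumed behaviour: term -> list of positions at which the term occurs
    positions = {}
    for pos, term in enumerate(terms):
        positions.setdefault(term, []).append(pos)
    return positions

print(make_word_position_dictionary(["inform", "retriev", "inform"]))
# {'inform': [0, 2], 'retriev': [1]}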
Example #11
    def preprocessing(self):

        tokenizer = RegexpTokenizer(r'\w+')
        querytokens = tokenizer.tokenize(self.raw_query)
        self.q_tf_dino = len(querytokens)  #length of the query (before stopword removal)

        #make spell correction first, then remove stop words and stem;
        #building a new list avoids popping while iterating
        processed_tokens = []
        for querytoken in querytokens:
            corrected = norvig_spell.correction(querytoken)
            #remove stop words from the corrected tokens
            if isStopWord(corrected):
                continue
            #replace the token with its stem
            processed_tokens.append(stemming(corrected))

        print("Query tokens", processed_tokens)
        return processed_tokens
Example #12
def test():
    ''' test your code thoroughly. put the testing cases here'''
    '''test code to check whether NLTK processes the document/sentence and returns tokens without punctuation'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize("this is me checking , . (tokenization ' )/\\ ")
    print(tokens)
    candidates = ['you', 'i', 'me', 'my', 'myself', 'bad', 'good']
    'checking that the function returns True for real stop words'
    for word in candidates:
        if isStopWord(word):
            print("stopword", word)
        else:
            print("Not stopword", word)
    stem = ['stemming', 'cars', 'experimental', 'coming']
    rootwords = []
    for s in stem:
        rootwords.append(stemming(s))
    print("words post stemming")
    print(rootwords)
    print('Pass')
Example #13
def preprocess_doc(doc):
    """Get the words back out of the file"""
    # Read in doc, only get subject and body of the document
    with open(doc) as f:
        doc_lines = f.readlines()

    sub_body_lines = []
    num_lines = 0  # default in case the "Lines:" header is missing
    for l in doc_lines:
        if l.startswith("Subject:"):
            sub_body_lines.append(l[9:].strip())
        if l.startswith("Lines:"):
            num_lines = int(l[7:])

    for l in doc_lines[len(doc_lines) - num_lines:]:
        if l != "\n":  # compare by value; "is not" would check identity
            sub_body_lines.append(l.strip())

    # Process all the words again
    # Get doc string
    doc_string = " ".join(sub_body_lines)

    # Tokenize and lowercase doc into list form
    token_list = util.tokenize_doc(doc_string)

    # Helper function to replace stopwords with empty string
    def remove_stop_word(tok):
        return "" if util.isStopWord(tok) else tok

    # Replace stopwords with empty strings so the list length is preserved
    token_list_no_stopword = list(map(remove_stop_word, token_list))

    # Stem the words
    stemmed_token_list = list(
        map(lambda tok: util.stemming(tok), token_list_no_stopword))

    return stemmed_token_list
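preprocess_doc assumes a 20-Newsgroups style message: header lines such as "Subject:" and "Lines:", with "Lines:" giving the length of the body that follows. A tiny self-contained check of that header handling (the sample text is made up):

sample = (
    "From: someone@example.com\n"
    "Subject: re-entry vehicles\n"
    "Lines: 2\n"
    "\n"
    "first body line\n"
    "second body line\n"
)
lines = sample.splitlines(keepends=True)
subject = next(l[9:].strip() for l in lines if l.startswith("Subject:"))
num_lines = next(int(l[7:]) for l in lines if l.startswith("Lines:"))
body = [l.strip() for l in lines[len(lines) - num_lines:] if l != "\n"]
print(subject, body)   # re-entry vehicles ['first body line', 'second body line']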
Example #14
    def preprocessing(self, qid):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''

        #ToDo: return a list of terms

        qbody = self.raw_query
        cqObj = CranFile('query.text')
        qbody = qbody.get(qid)  #self.convertFromMap(qbody)   #self.docs
        #print(qbody)
        try:
            qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
        except Exception:
            print("Query ID which is not having text: ", qid)
            raise

        reduced = nltk.tokenize.word_tokenize(qbody)
        '''for words in reduced:
            # reduced terms are passed through stopwords and stemming in util
            if util.isStopWord(words):
                self.query.append(util.stemming(words).lower())
        # normalized terms are stored in reducedList Dictionary
        print("1...", self.query)'''

        correctedwords = [correction(word) for word in reduced]
        lowercasewords = [word.lower() for word in correctedwords]
        notstopwords = []
        for word in lowercasewords:
            if not util.isStopWord(word):  #keep only non-stopwords, stemmed
                notstopwords.append(util.stemming(word))
        if len(notstopwords) > 0:
            self.query.append(notstopwords)

        print("1...", self.query)
Example #15
    def indexDoc(self, docs):  # indexing a Document object
        ''' indexing a document using the simple SPIMI algorithm; no need to store blocks due to the small collection we are handling, so the whole index is saved/loaded instead '''

        # ToDo: indexing only title and body; use some functions defined in util.py
        # (1) convert to lower cases,
        # (2) remove stopwords,
        # (3) stemming

        #lower case title and body
        t = docs.title.lower()
        b = docs.body.lower()

        self.nDocs = self.nDocs + 1

        #remove numbers
        t = re.sub(r'\d+', '', t)
        b = re.sub(r'\d+', '', b)

        #tokenize
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(t)

        #stopword removal
        words = []
        for i in tokens:
            if not util.isStopWord(i):
                words.append(i)

        tokens = tokenizer.tokenize(b)
        for i in tokens:
            if not util.isStopWord(i):
                words.append(i)
        #stemming
        word = []
        for i in words:
            word.append(util.stemming(i))

        #to store the terms of each document as list to compute unit vector of documents
        self.dictionary[docs.docID] = []

        #add each term to the index
        for pos, i in enumerate(word):
            if i not in self.items:
                self.items[i] = IndexItem(
                    i)  # create an IndexItem object for each term
            self.items[i].add(
                docs.docID,
                pos + 1)  # add documents and positions to posting dictionary
            self.dictionary[docs.docID].append(
                i)  #adds each term to the list appearing in a document

        self.dictionary[docs.docID] = set(self.dictionary[
            docs.docID])  #removes duplicate terms from the list

        #computing tf-idf
        if self.nDocs == self.N:

            #sort the index by terms
            self.index = OrderedDict(sorted(self.items.items()))

            #compute document frequency for each term
            for term in self.index:
                self.df[term] = len(self.index[term].posting)
                self.index[term].sort()  #sort the posting by docID

            #compute the term frequency for each term in a document
            for term in self.index:
                self.tf[term] = {}
                for docc in self.index[term].sortedp:
                    if len(self.index[term].sortedp[docc].positions) == 0:
                        self.tf[term][docc] = 0
                    else:
                        #compute  (1+ log(tf))*idf
                        self.tf[term][docc] = (1 + math.log10(
                            len(self.index[term].sortedp[docc].positions))
                                               ) * self.idf(term)

            #compute the sum of squares of each term in the document and calculate square root
            sums = {}
            for d in self.dictionary:
                sums[d] = 0
                for i in self.dictionary[d]:
                    sums[d] += math.pow(self.tf[i][d], 2)
                sums[d] = math.sqrt(sums[d])

            #Divide tf*idf by sqrt(sum of squares) and store in tf dictionary -> weights for each term in the documents
            for term in self.index:
                self.idfs[term] = self.idf(
                    term
                )  #compute the idf of each term in the index and store in dictionary
                for docc in self.index[term].sortedp:
                    if sums[docc] == 0:
                        self.tf[term][docc] = 0
                    else:
                        self.tf[term][docc] = self.tf[term][docc] / sums[docc]
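A small worked example of the weighting used above: w(t, d) = (1 + log10(tf)) * idf(t), after which each document vector is divided by its Euclidean length. The idf here is assumed to be log10(N / df), the usual definition; the project's self.idf may differ, and all numbers are illustrative.

import math

N = 1400                      # collection size (Cranfield-sized, for illustration)
tf = 3                        # the term occurs 3 times in the document
df = 7                        # the term occurs in 7 documents

idf = math.log10(N / df)                     # log10(200) ~ 2.301
w = (1 + math.log10(tf)) * idf               # (1 + 0.477) * 2.301 ~ 3.40
doc_weights = [w, 1.2, 0.8]                  # weights of the other terms in the same doc
norm = math.sqrt(sum(x * x for x in doc_weights))
print([round(x / norm, 3) for x in doc_weights])   # unit-length document vector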
Example #16
import util

print(util.isStopWord("hel"))
print(util.stemming("Running"))

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if w not in stop_words]

# the same filtering written as an explicit loop (rebuilds the list above)
filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)
Example #17
    def indexDoc(self, Pdoc):  # indexing a Document object
        ''' indexing a document using the simple SPIMI algorithm; no need to store blocks
            due to the small collection we are handling, so the whole index is saved/loaded instead '''
        # ToDo: indexing only title and body; use some functions defined in util.py
        # (1) convert to lower cases
        Predictionary = {}
        'Tokenizing the document'
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(Pdoc.body)
        '''iterate over the tokens, converting each one to lowercase'''
        for tokenpos, token in enumerate(tokens):
            IndexItemobj = IndexItem(token.lower())
            # (2) remove stopwords: skip the token if it is a stop word
            if isStopWord(IndexItemobj.term):
                continue
            if IndexItemobj.term not in Predictionary:
                'store the token position and docID for a token seen for the first time'
                IndexItemobj.posting[int(Pdoc.docID)] = [tokenpos]
                Predictionary[IndexItemobj.term] = IndexItemobj.posting
            else:
                'the token is already in Predictionary: append the position instead of replacing it'
                docIDlist = Predictionary[IndexItemobj.term]
                if int(Pdoc.docID) not in docIDlist:
                    docIDlist[int(Pdoc.docID)] = [tokenpos]
                else:
                    docIDlist[int(Pdoc.docID)].append(tokenpos)

        # (3) stemming: merge tokens that share a stem, combining their postings
        Stemdictionary = {}
        for keytoken, values in Predictionary.items():
            stem = stemming(keytoken)
            if stem not in Stemdictionary:
                Stemdictionary[stem] = values
            else:
                stemlist = Stemdictionary[stem]
                'merge the positions of every docID into the existing posting'
                for valuekey, valueposition in values.items():
                    if valuekey in stemlist:
                        stemlist[valuekey].extend(valueposition)
                    else:
                        stemlist[valuekey] = list(valueposition)

        'sorting the token positions in Stemdictionary'
        for posting in Stemdictionary.values():
            for positions in posting.values():
                positions.sort()
        # collect this document's stem dictionary once, after merging
        Doclist.append(Stemdictionary)

        'single pass in memory indexing'
        '''Below code builds the inverted index using SPIMI-INVERT'''
        for termdata in Doclist:
            for token, posting in termdata.items():
                if token not in dictionary:
                    'add the posting to the dictionary if it does not contain the token yet'
                    dictionary[token] = [posting]  # add to dictionary
                else:
                    'get the postings list of an existing term and append the new posting to it'
                    Getpostinglist = dictionary[token]
                    if posting not in Getpostinglist:
                        Getpostinglist.append(posting)  # add to postings list