# Imports needed by the functions below (bs4 provides BeautifulSoup;
# Tokenization and Normalization are the project's own modules)
from bs4 import BeautifulSoup

import Normalization
import Tokenization


def searchQueryUnion(query, dictionary):
    # Return the union of the posting lists of every query token found in the dictionary
    DocID = []
    # Tokenize the query for later processing
    tokens = Tokenization.tokenization(query)
    # Normalize the tokens for later processing
    for index in range(len(tokens)):
        tokens[index] = Normalization.cleanStopWords150(tokens[index])
        tokens[index] = Normalization.cleanedTokens(tokens[index])
        tokens[index] = Normalization.caseFoldedTokens(tokens[index])
        tokens[index] = Normalization.stemmedTokens(tokens[index])
        if tokens[index] in dictionary:
            DocID = DocID + dictionary.get(tokens[index])
    if DocID != []:
        # Change each docID entry from string to int
        for index in range(len(DocID)):
            DocID[index] = DocID[index].split(',')
            DocID[index] = DocID[index][0].split()
            DocID[index] = map(int, DocID[index])
        # Take the union of the DocID sets for the different tokens in the query
        DocID = [set(ids) for ids in DocID]
        finalID = sorted(set.union(*DocID))
        return finalID
    else:
        return DocID

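# A minimal usage sketch for searchQueryUnion (not part of the original code;
# all sample values are hypothetical). It assumes the dictionary maps a
# normalized token to a list of posting strings whose first comma-separated
# field holds space-separated docIDs (the format the parsing above expects),
# that Tokenization.tokenization splits the query on whitespace, and that the
# normalization pipeline leaves the sample words 'oil' and 'gold' unchanged.
def demoSearchQueryUnion():
    sampleDictionary = {'oil': ['1 3 7'], 'gold': ['3 9']}
    print(searchQueryUnion('oil gold', sampleDictionary))  # expected: [1, 3, 7, 9]
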
def getInvertedIndexTokens(fileNum):
    # Build a boolean (document-level) inverted index for one Reuters file
    invertedIndex = {}
    # Load the original document for tokenization and normalization later
    fileNum = '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    sgmFile = open(resourcepath)
    soup = BeautifulSoup(sgmFile, 'html.parser')
    allReuters = soup.find_all('reuters')
    for reuters in allReuters:
        if reuters.body is not None:
            # Tokenize the text inside the <BODY> tag of each article
            tokenslist = reuters.body.text.rsplit('reuters', 1)
            for e in tokenslist:
                tokens = ''.join(e).encode('utf8')
                tokens = Tokenization.tokenization(tokens)
                for token in tokens:
                    # Normalize each token
                    token = Normalization.cleanedTokens(token)
                    token = Normalization.caseFoldedTokens(token)
                    token = Normalization.cleanStopWords150(token)
                    token = Normalization.stemmedTokens(token)
                    # Construct the inverted index for the tokens
                    if token != '':
                        if token in invertedIndex:
                            if reuters['newid'] not in invertedIndex[token]:
                                invertedIndex[token].append(reuters['newid'])
                        else:
                            invertedIndex[token] = [reuters['newid']]
    sgmFile.close()
    return invertedIndex

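# A minimal usage sketch for getInvertedIndexTokens (an assumption, not part
# of the original code): it presumes the Reuters-21578 collection is unpacked
# under reuters/ so that reuters/reut2-000.sgm exists, builds the boolean
# index for that file, and prints the postings of a few terms.
def demoGetInvertedIndexTokens():
    index = getInvertedIndexTokens(0)  # parses reuters/reut2-000.sgm
    for term in sorted(index)[:5]:
        print(term + ': ' + str(index[term]))  # term -> list of NEWID strings
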
def purrify(query):
    # Purify the query: tokenize it and normalize each token
    normalizedTokens = []
    tokens = Tokenization.tokenization(query)
    for token in tokens:
        token = Normalization.cleanedTokens(token)
        token = Normalization.caseFoldedTokens(token)
        token = Normalization.cleanStopWords150(token)
        token = Normalization.stemmedTokens(token)
        if token != "":
            normalizedTokens.append(token)
    return normalizedTokens

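# A small illustration of purrify (hypothetical sample query; the exact output
# depends on the stop-word list and the stemmer used by Normalization). It
# shows that the function returns a list of cleaned, case-folded, stemmed
# tokens with stop words removed.
def demoPurrify():
    print(purrify('The Price of Crude Oil'))  # e.g. ['price', 'crude', 'oil'], depending on the stemmer
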
def getAllTokens(fileNum):
    # Create a term-frequency inverted index, looping through all articles in one file
    invertedIndex = {}
    tokensLength = open('invertedIndex/tokensLength', 'a')
    Content = open('invertedIndex/Content', 'a')
    # Load the original document for tokenization and normalization later
    fileNum = '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    sgmFile = open(resourcepath)
    soup = BeautifulSoup(sgmFile, 'html.parser')
    for doc in soup.find_all('reuters'):
        docId = int(doc['newid'].encode('utf8'))
        tokenCounter = 0
        if doc.body is not None:
            content = doc.body.text
            Content.write(str(docId) + ' Start ' + content.encode('utf8') + ' End ')
            tokens = Tokenization.tokenization(content)
            for token in tokens:
                # Normalize each token
                token = Normalization.cleanedTokens(token)
                token = Normalization.caseFoldedTokens(token)
                token = Normalization.cleanStopWords150(token)
                token = Normalization.stemmedTokens(token)
                if token != '':
                    tokenCounter += 1
                    # Add to the postings list; increment tf if the word already exists for this document
                    if token in invertedIndex:
                        if docId in invertedIndex[token]:
                            invertedIndex[token][docId] += 1
                        else:
                            invertedIndex[token][docId] = 1
                    else:
                        invertedIndex[token] = {docId: 1}
            # Record the number of tokens kept for this document
            tokensLength.write(str(docId) + ':' + str(tokenCounter) + '\n')
    sgmFile.close()
    tokensLength.close()
    Content.close()
    return invertedIndex

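# A sketch of building the full term-frequency index across the whole
# collection (assumptions, not part of the original code: the 22 files
# reut2-000.sgm .. reut2-021.sgm are present, the invertedIndex/ output
# directory exists, and NEWID values are unique across files as in
# Reuters-21578, so per-file dictionaries can be merged with a simple update
# of the per-document counts).
def buildFullIndex():
    fullIndex = {}
    for fileNum in range(22):
        partial = getAllTokens(fileNum)
        for token in partial:
            if token in fullIndex:
                fullIndex[token].update(partial[token])
            else:
                fullIndex[token] = partial[token]
    return fullIndex
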