Example 1
from bs4 import BeautifulSoup    # used by getInvertedIndexTokens below
import Tokenization              # project tokenization module (import path assumed)
import Normalization             # project normalization module (import path assumed)

def searchQueryUnion(query, dictionary):
    DocID = []
    # Tokenize the query for later processing
    tokens = Tokenization.tokenization(query)
    # Normalize each token before looking it up in the dictionary
    for index in range(len(tokens)):
        tokens[index] = Normalization.cleanStopWords150(tokens[index])
        tokens[index] = Normalization.cleanedTokens(tokens[index])
        tokens[index] = Normalization.caseFoldedTokens(tokens[index])
        tokens[index] = Normalization.stemmedTokens(tokens[index])
        if tokens[index] in dictionary:
            DocID = DocID + dictionary.get(tokens[index])
    if DocID:
        # Convert each docID entry from string to int
        for index in range(len(DocID)):
            DocID[index] = DocID[index].split(',')
            DocID[index] = DocID[index][0].split()
            DocID[index] = list(map(int, DocID[index]))
        # Take the union of the DocID sets of the different query tokens
        DocID = [set(ids) for ids in DocID]
        finalID = sorted(set.union(*DocID))
        return finalID
    else:
        # No query token matched the dictionary: return an empty result
        return DocID
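
# Usage sketch (not part of the original code): a minimal, hypothetical call to
# searchQueryUnion. toyIndex is a hand-built inverted index mapping a normalized
# token to a list of docID strings; the expected result assumes Tokenization and
# Normalization leave these particular query tokens unchanged.
if __name__ == '__main__':
    toyIndex = {'cocoa': ['1', '14'], 'oil': ['3', '14']}
    print(searchQueryUnion('cocoa oil', toyIndex))   # union of the postings -> [1, 3, 14]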
def getInvertedIndexTokens(fileNum):
    invertedIndex={}
    # Load the original documents for later tokenization and normalization
    fileNum= '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    allReuters=soup.find_all('reuters')
    for reuters in allReuters:
        if reuters.body is not None:
            # Tokenize the text inside the <BODY> tag of each article
            tokenslist=reuters.body.text.rsplit('reuters',1)
            for e in tokenslist:
                tokens=''.join(e).encode('utf8')
                tokens=Tokenization.tokenization(tokens)
                for token in tokens:
                    # Normalize each token after tokenization
                    token = Normalization.cleanedTokens(token)
                    token = Normalization.caseFoldedTokens(token)
                    token = Normalization.cleanStopWords150(token)
                    token = Normalization.stemmedTokens(token)
                    #Construct the inverted index for tokens
                    if token != '':
                        if token in invertedIndex:
                            if reuters['newid'] not in invertedIndex[token]:
                                invertedIndex[token].append(reuters['newid'])
                                
                        else:
                            invertedIndex[token] = [reuters['newid']]
                            
    return invertedIndex
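
# Usage sketch (hypothetical driver, not part of the original code): build the
# token -> [newid, ...] maps for a few Reuters files and merge them into one
# dictionary with the shape expected by searchQueryUnion. The range below assumes
# reuters/reut2-000.sgm through reut2-002.sgm are present locally.
if __name__ == '__main__':
    fullIndex = {}
    for num in range(3):
        for token, postings in getInvertedIndexTokens(num).items():
            docList = fullIndex.setdefault(token, [])
            for newid in postings:
                if newid not in docList:
                    docList.append(newid)
    print(len(fullIndex), 'distinct terms indexed')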
def purrify(query):
    # Normalize ("purify") the query into a list of clean tokens
    normalizedTokens = []
    tokens = Tokenization.tokenization(query)
    for token in tokens:
        token = Normalization.cleanedTokens(token)
        token = Normalization.caseFoldedTokens(token)
        token = Normalization.cleanStopWords150(token)
        token = Normalization.stemmedTokens(token)
        if token != "":
            normalizedTokens.append(token)
    return normalizedTokens
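
# Usage sketch (not part of the original code): purrify returns the cleaned,
# stop-word-free, case-folded and stemmed tokens of a raw query. The exact stems
# depend on the project's Normalization module, so the output shown is indicative only.
if __name__ == '__main__':
    print(purrify('The Prices of Cocoa Futures'))   # e.g. ['price', 'cocoa', 'futur']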
Example 4
from bs4 import BeautifulSoup
import Tokenization              # project tokenization module (import path assumed)
import Normalization             # project normalization module (import path assumed)

def getAllTokens(fileNum):
    # Create the inverted index, looping through all articles in one file
    invertedIndex = {}
    tokensLength= open('invertedIndex/tokensLength', 'a')
    Content = open('invertedIndex/Content', 'a')
    # Load the original documents for later tokenization and normalization
    fileNum= '%0*d' % (3, fileNum)
    resourcepath = 'reuters/reut2-' + fileNum + '.sgm'
    file = open(resourcepath)
    soup = BeautifulSoup(file, 'html.parser')
    for doc in soup.find_all('reuters'):
        docId = int(doc['newid'].encode('utf8'))
        tokenCounter = 0
        if doc.body is not None:
            content = doc.body.text
            length=len(content)
            Content.write (str(docId) + ' Start ' + content.encode('utf8') + ' End ')
            tokens = Tokenization.tokenization(content)
            for token in tokens:
                # Normalization
                token = Normalization.cleanedTokens(token)
                token = Normalization.caseFoldedTokens(token)
                token = Normalization.cleanStopWords150(token)
                token = Normalization.stemmedTokens(token)
                if token != '':
                    tokenCounter += 1
                    # Add to the postings list if the word exists
                    if token in invertedIndex:
                        if docId in invertedIndex[token]:
                            invertedIndex[token][docId] += 1
                        else:
                            invertedIndex[token][docId] = 1
                    else:
                        invertedIndex[token] = {docId: 1}
        tokensLength.write (str(docId) + ':' + str(tokenCounter) +'\n')
    tokensLength.close()
    Content.close()                
    return invertedIndex
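
# Usage sketch (hypothetical, not part of the original code): getAllTokens returns
# token -> {docId: term frequency} for one file, and it also appends to the files
# under invertedIndex/, so that directory is assumed to exist. Document frequency
# per token is then simply the size of each postings dictionary.
if __name__ == '__main__':
    index = getAllTokens(0)                 # reads reuters/reut2-000.sgm
    df = {token: len(postings) for token, postings in index.items()}
    term = next(iter(index))
    print(term, index[term], df[term])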