def _context_words(tokens, index, width, exclude):
    """Return the tokens within ``width`` positions of ``index``.

    Following tokens come first, then preceding ones. The slices are clamped
    to the bounds of ``tokens`` (no wrap-around to the other end of the
    document) and every token equal to ``exclude`` is dropped.
    """
    following = tokens[index + 1:index + 1 + width]
    preceding = tokens[max(0, index - width):index]
    return [t for t in following + preceding if t != exclude]


def similar(docSet, InvIndex, word):
    """Print up to ten terms that most often occur close to ``word``.

    Every document of ``docSet`` that is listed under ``InvIndex[word]`` is
    re-read through parseXML -> lowercase -> tokenize -> filterData ->
    stemmer. For each occurrence of ``word`` the (up to) three tokens on
    either side are collected; collection stops once 1000 terms are held.
    The collected terms are ranked by frequency and the top ten are printed.

    Args:
        docSet: iterable of document identifiers. Each one is turned into a
            file name via ``"cranfield" + zfill(str(doc), 4)``.
        InvIndex: inverted index. Must hold the corpus directory under the
            ``"path_of_documents"`` key and, for each term, a container
            supporting ``doc in ...``.
        word: term to look up. It is compared against stemmed tokens, so it
            presumably has to be given in stemmed form -- confirm with callers.

    Returns:
        None. All output goes to standard output.
    """
    # Imported locally so the block does not depend on file-level imports.
    from collections import Counter

    window = 3            # tokens taken on each side of an occurrence
    max_collected = 1000  # stop gathering context once this many terms are held
    top_n = 10            # number of terms printed

    # Single-argument print(...) behaves identically as a Python 2 statement
    # and as a Python 3 function call.
    if word not in InvIndex:
        print("")
        print("Word is not in the index.")
        print("")
        return

    postings = InvIndex[word]
    collected = []
    for doc in docSet:
        if len(collected) >= max_collected:
            break  # cap reached: parsing further documents would add nothing
        if doc not in postings:
            continue
        tokens = stemmer(
            filterData(
                tokenize(
                    lowercase(parseXML(InvIndex["path_of_documents"] + "/" + "cranfield" + zfill(str(doc), 4)))
                )
            )
        )
        tokens = [t for t in tokens if t != ""]
        for pos, token in enumerate(tokens):
            if token != word:
                continue
            if len(collected) >= max_collected:
                break
            # The query word itself is excluded: it is not a useful
            # "similar term" for itself.
            collected.extend(_context_words(tokens, pos, window, word))

    # Counter ranks in one pass instead of calling list.count once per term.
    ranked = [term for term, _ in Counter(collected).most_common(top_n)]
    print("")
    print("List of Similar terms (in stemmed form) is")
    print(ranked)
    print("")
def MakeIndex(path):
    """Build an inverted index from every entry of the directory ``path``.

    Each entry is run through parseXML -> lowercase -> tokenize ->
    filterData -> stemmer, and the resulting tokens are handed to
    ``invertedListAppend`` together with the entry's file name.

    Args:
        path: directory that holds the documents to index.

    Returns:
        The index as returned by the last ``invertedListAppend`` call
        (presumably the same dict that is passed in -- confirm against that
        helper). It is seeded with the source directory under the
        ``'path_of_documents'`` key.

    Raises:
        SystemExit: if ``path`` is not an existing directory; a message is
            printed to standard output first.
    """
    # Imported locally so the block does not depend on file-level imports.
    import os.path
    import sys

    # isdir rather than exists: a regular file "exists" but cannot be listed,
    # which used to surface as an uncaught error from listdir.
    if not os.path.isdir(path):
        # Formatted single-argument print emits the same text under
        # Python 2 and Python 3.
        print("%s %s" % (path, 'is not a valid path, exiting...'))
        sys.exit()

    index = {'path_of_documents': path}
    for name in listdir(path):
        tokens = parseXML(path + "/" + name)
        tokens = lowercase(tokens)
        tokens = tokenize(tokens)
        tokens = filterData(tokens)
        tokens = stemmer(tokens)
        index = invertedListAppend(tokens, name, index)
    return index
def phraseSearch(phrase, wordSet, docSet, InvIndex):
    """Score the documents matched by a quoted phrase query.

    The phrase is stripped of surrounding whitespace and double quotes,
    split on whitespace and stemmed. The index entries of its terms are then
    folded left to right with ``phraseMerge``.

    Args:
        phrase: raw phrase text, optionally wrapped in double quotes.
        wordSet: collection of indexed terms; the phrase can only match when
            every one of its stemmed terms is contained in it.
        docSet: not used by this function; kept so existing callers keep
            working.
        InvIndex: inverted index mapping a term to a dict whose values are
            sized collections (presumably positions per document -- confirm
            against the index builder). It is not modified.

    Returns:
        A new dict mapping each key of the merged result to
        ``number_of_phrase_terms * len(value)``. Empty when the phrase has
        no terms or contains a term missing from ``wordSet``.
    """
    terms = stemmer(phrase.strip().strip('"').split())
    if not terms or not set(terms).issubset(wordSet):
        return {}

    merged = InvIndex[terms[0]]
    for term in terms[1:]:
        merged = phraseMerge(merged, InvIndex[term])

    # Build a fresh dict instead of assigning into `merged`: for a one-term
    # phrase `merged` IS the index's own entry, and writing scores into it
    # would replace the stored collections with integers.
    phrase_len = len(terms)
    return {doc: phrase_len * len(hits) for doc, hits in merged.items()}
# Example #4
# 0
from Document import lowercase, tokenize, filterData, printDocument, getSnippets, printDoc, printTitle, stemmer

    
if __name__ == '__main__':
    #wordset is the set of all unique words in index, docset is the set of all documents, invl is the inverted list
    wordSet,docSet,invl = preprocess()
    while True:
        print 'Enter search string: (Empty string to exit)'        
        searchString = raw_input()
        searchString = searchString.strip()
                
        if searchString == '':
            break
        #separate the individual words or phrases in the searchString
        parsedSearch = parseSearchString(searchString)    
        parsedSearch = stemmer(parsedSearch)
        searchString = ' '.join(parsedSearch)
        #Checking for the type of query desired by the user
        if parsedSearch[0] == 'df':
            print ''
            print 'Document Frequency of \'',searchString[3:],'\' is', df(searchString[3:], wordSet, docSet, invl)
            print ''
            continue
        elif parsedSearch[0] == 'tf':
            newSearch = ' '.join(parsedSearch[2:])
            if len(parsedSearch) == 1:
                print ''
                print 'Please name of document after tf.....'
                print ''
                continue
            print ''