def similar(docSet, InvIndex, word):
    """Print up to ten stemmed terms that occur most often near *word*.

    Every document of *docSet* that is listed under ``InvIndex[word]`` is
    re-read from disk and passed through the same parse / lowercase /
    tokenize / filter / stem pipeline.  The terms found within three
    positions on either side of each occurrence of *word* are collected and
    ranked by how often they were seen.

    Args:
        docSet: iterable of document identifiers.  Each identifier is turned
            into the file name ``"cranfield"`` + the id zero-padded to four
            characters.
        InvIndex: the inverted index.  It must map *word* to a container of
            document identifiers and hold the corpus directory under the
            key ``"path_of_documents"``.
        word: the term to look up.  It is compared against stemmed tokens,
            so it has to be given in stemmed form to match anything.

    Returns:
        None.  The outcome is printed to standard output.
    """
    # Number of neighbouring terms taken on each side of an occurrence.
    WINDOW = 3
    # Stop gathering context once roughly this many terms were collected.
    MAX_CONTEXT_TERMS = 1000

    # Single-argument print(...) behaves exactly like the print statement on
    # Python 2 and is also valid on Python 3.
    if word not in InvIndex:
        print("")
        print("Word is not in the index.")
        print("")
        return

    postings = InvIndex[word]
    wordCol = []
    for doc in docSet:
        # Once the cap is reached no further document can contribute, so
        # there is no point in reading and stemming it.
        if len(wordCol) >= MAX_CONTEXT_TERMS:
            break
        if doc not in postings:
            continue

        docPath = (InvIndex["path_of_documents"] + "/" + "cranfield"
                   + str(doc).zfill(4))
        textS = stemmer(filterData(tokenize(lowercase(parseXML(docPath)))))
        textS = [t for t in textS if t != ""]

        for index, token in enumerate(textS):
            if token != word:
                continue
            if len(wordCol) >= MAX_CONTEXT_TERMS:
                break
            # Plain slices clamp at the document boundaries; max() keeps a
            # negative start from wrapping around to the end of the text.
            neighbours = (textS[max(0, index - WINDOW):index]
                          + textS[index + 1:index + 1 + WINDOW])
            # The query term itself is never a useful "similar" term.
            wordCol.extend(t for t in neighbours if t != word)

    # Count each term once instead of calling list.count per distinct term.
    counts = {}
    for term in wordCol:
        counts[term] = counts.get(term, 0) + 1
    # Most frequent first; ties are broken alphabetically so that the output
    # is reproducible from run to run.
    result = sorted(counts, key=lambda term: (-counts[term], term))

    print("")
    print("List of Similar terms (in stemmed form) is")
    print(result[:10])
    print("")
def MakeIndex(path):
    """Build the inverted index for every file found in directory *path*.

    Each directory entry is parsed, lower-cased, tokenized, filtered and
    stemmed, and the resulting terms are handed to ``invertedListAppend``
    together with the file name.

    Args:
        path: directory that contains the documents to index.

    Returns:
        The index dictionary.  Besides whatever ``invertedListAppend``
        stores in it, it maps the key ``'path_of_documents'`` to *path*.

    Raises:
        SystemExit: with status 1, after printing a message, when *path* is
            not an existing directory.
    """
    import os
    import sys

    # isdir() rather than exists(): a path that names a regular file would
    # pass an existence check and then blow up inside listdir().
    if not os.path.isdir(path):
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and is also valid on Python 3.
        print("%s %s" % (path, 'is not a valid path, exiting...'))
        # sys.exit() instead of the site-provided exit(), which is meant for
        # the interactive shell; a non-zero status reports the failure.
        sys.exit(1)

    InvertedIndex = {'path_of_documents': path}
    for fileName in os.listdir(path):
        text = parseXML(path + "/" + fileName)
        text = lowercase(text)
        text = tokenize(text)
        text = filterData(text)
        text = stemmer(text)
        InvertedIndex = invertedListAppend(text, fileName, InvertedIndex)
    return InvertedIndex
def phraseSearch(phrase, wordSet, docSet, InvIndex):
    """Score the documents that match a quoted phrase.

    The phrase is stripped of surrounding whitespace and double quotes,
    split on whitespace and stemmed.  The postings of the first term are
    then combined with those of every following term, in order, through
    ``phraseMerge``.

    Args:
        phrase: the phrase as typed, optionally wrapped in double quotes.
        wordSet: collection of all indexed terms; every stemmed term of the
            phrase must be in it for the search to return anything.
        docSet: unused here; kept so the signature stays unchanged.
        InvIndex: inverted index mapping each term to its postings.

    Returns:
        A new dict mapping each matching document to
        ``number_of_phrase_terms * len(postings_entry)``, or an empty dict
        when the phrase is empty or contains a term that is not indexed.
    """
    words = stemmer(phrase.strip().strip('"').split())
    phraseLen = len(words)
    if not words or not set(words).issubset(wordSet):
        return {}

    merged = InvIndex[words[0]]
    for nextWord in words[1:]:
        merged = phraseMerge(merged, InvIndex[nextWord])

    # Build a fresh mapping for the scores.  For a one-word phrase `merged`
    # IS the index's own postings object, so writing the scores back into it
    # would replace the stored entries with integers and break later queries.
    return {doc: phraseLen * len(hits) for doc, hits in merged.items()}
from Document import lowercase, tokenize, filterData, printDocument, getSnippets, printDoc, printTitle, stemmer if __name__ == '__main__': #wordset is the set of all unique words in index, docset is the set of all documents, invl is the inverted list wordSet,docSet,invl = preprocess() while True: print 'Enter search string: (Empty string to exit)' searchString = raw_input() searchString = searchString.strip() if searchString == '': break #seperate the individual words or phrases in the searchString parsedSearch = parseSearchString(searchString) parsedSearch = stemmer(parsedSearch) searchString = ' '.join(parsedSearch) #Checking for the type of query desired by the user if parsedSearch[0] == 'df': print '' print 'Document Frequency of \'',searchString[3:],'\' is', df(searchString[3:], wordSet, docSet, invl) print '' continue elif parsedSearch[0] == 'tf': newSearch = ' '.join(parsedSearch[2:]) if len(parsedSearch) == 1: print '' print 'Please name of document after tf.....' print '' continue print ''