def _neighbours(tokens, position, word, window):
    """Return up to ``window`` tokens on each side of ``position``.

    Slice bounds are clamped to the token list instead of wrapped around it,
    so a hit near the start or end of a document only contributes the
    neighbours that really exist. Other occurrences of ``word`` itself are
    dropped so the query term is never reported as similar to itself.
    """
    before = tokens[max(0, position - window):position]
    after = tokens[position + 1:position + 1 + window]
    return [token for token in before + after if token != word]


def similar(docSet, InvIndex, word):
    """Print the terms that most often appear next to ``word``.

    For every document in ``docSet`` that is listed under ``InvIndex[word]``,
    the document is re-read and pushed through the same
    parseXML -> lowercase -> tokenize -> filterData -> stemmer pipeline, and
    the three tokens on each side of every occurrence of ``word`` are
    collected. The ten most frequent collected terms are printed.

    Args:
        docSet: iterable of document identifiers. Each one is rendered with
            ``str()`` and zero-padded to four digits to build the file name
            ``cranfieldNNNN``.
        InvIndex: mapping that holds, for each indexed term, a container of
            document identifiers, plus the directory of the documents under
            the key ``"path_of_documents"``.
        word: the term to look up, compared against the stemmed tokens.

    Returns:
        None. The result (or a "not in the index" notice) is printed.
    """
    from collections import Counter

    window = 3            # tokens taken on each side of an occurrence
    max_collected = 1000  # stop gathering context once this many terms exist
    top_n = 10            # number of terms reported

    if word not in InvIndex:
        print("")
        print("Word is not in the index.")
        print("")
        return

    postings = InvIndex[word]
    collected = []
    for doc in docSet:
        # Once the cap is reached no further occurrence would be processed,
        # so there is no point in parsing the remaining documents.
        if len(collected) >= max_collected:
            break
        if doc not in postings:
            continue

        doc_path = (InvIndex["path_of_documents"] + "/" + "cranfield"
                    + zfill(str(doc), 4))
        tokens = stemmer(filterData(tokenize(lowercase(parseXML(doc_path)))))
        tokens = [token for token in tokens if token != ""]

        for position, token in enumerate(tokens):
            if token != word:
                continue
            # The cap is checked before each occurrence, so the final
            # occurrence may push the total slightly above the cap.
            if len(collected) >= max_collected:
                break
            collected.extend(_neighbours(tokens, position, word, window))

    # most_common() ranks by frequency in a single pass over the terms.
    ranked = [term for term, _ in Counter(collected).most_common(top_n)]

    print("")
    print("List of Similar terms (in stemmed form) is")
    print(ranked)
    print("")
def MakeIndex(path):
    """Build the inverted index for every entry found in directory ``path``.

    Each entry returned by ``listdir(path)`` is read with ``parseXML`` and
    passed through lowercase -> tokenize -> filterData -> stemmer; the
    resulting terms are merged into the index by ``invertedListAppend``
    under that entry's file name.

    Args:
        path: directory that contains the documents to index.

    Returns:
        The index dictionary. Besides whatever ``invertedListAppend`` adds,
        it always carries the source directory under the key
        ``'path_of_documents'``.

    Raises:
        SystemExit: with status 1 when ``path`` does not exist (a notice is
            printed first).
    """
    import sys

    if not exists(path):
        print("{0} is not a valid path, exiting...".format(path))
        # sys.exit is always available (the bare exit() helper comes from the
        # site module) and a non-zero status tells the caller that it failed.
        sys.exit(1)

    InvertedIndex = {'path_of_documents': path}
    for file_name in listdir(path):
        text = parseXML(path + "/" + file_name)
        text = stemmer(filterData(tokenize(lowercase(text))))
        InvertedIndex = invertedListAppend(text, file_name, InvertedIndex)
    return InvertedIndex