class Index_System:
    """Build, persist and display an inverted index over a folder of text files.

    The index itself lives in an ``Inverted_Index`` object (defined elsewhere
    in the project) that is driven through ``createNode`` / ``setRoot`` /
    ``insert`` / ``setDocCount`` / ``clean`` / ``printTree``.
    NOTE(review): those calls suggest a rooted (tree-like) structure rather
    than a hash table -- confirm against ``Inverted_Index``.

    Documents are tracked in ``docList`` as ``(doc_id, file_name)`` tuples,
    where ``doc_id`` is the zero-based position of the file in the scan.
    """

    # Class-level defaults; __init__ rebinds per-instance values.
    root, inverted_index, docList = None, None, []
    # Words that are never indexed. Matching is exact and case-sensitive.
    stop_words = ["a", "an", "and", "are", "as", "at", "be", "by", "for",
                  "from", "has", "he", "in", "is", "it", "its", "of", "on",
                  "that", "the", "to", "was", "were", "will", "with"]

    def __init__(self):
        """Start with an empty index, no root node and no known documents."""
        self.inverted_index = Inverted_Index()
        self.root = None
        self.docList = []

    # TODO: def normalize(word):pass

    def check_stopword(self, word):
        """Return True if *word* is exactly one of ``stop_words``."""
        return word in self.stop_words

    def create(self, folder_path, index_path):
        """Create the inverted index from all files in *folder_path*.

        Every regular, non-hidden file directly inside *folder_path* is read
        line by line, split on whitespace, and each non-stop-word token is
        inserted into the index together with its ``(doc_id, file_name)``
        entry. The finished index is pickled to *index_path*.

        Args:
            folder_path: Directory holding the documents to index.
            index_path: Destination file for the pickled index.

        Returns:
            False if *folder_path* is not an existing directory (nothing is
            written in that case), True once the index has been saved.

        Raises:
            OSError: If a document or *index_path* cannot be opened.
        """
        # Drop any index left over from a previous build.
        if self.root is not None:
            self.inverted_index.clean(self.root)
            self.inverted_index = Inverted_Index()
            self.root = None
        self.docList.clear()

        if not os.path.isdir(folder_path):
            return False

        # Build explicit paths instead of os.chdir(): the working directory
        # is process-wide state and was left changed whenever indexing raised.
        # glob.escape keeps special characters in folder_path literal.
        pattern = os.path.join(glob.escape(folder_path), "*")

        doc_count = 0
        for path in glob.glob(pattern):
            # "*" also matches sub-directories, which cannot be opened as
            # documents; skip anything that is not a regular file.
            if not os.path.isfile(path):
                continue

            doc_entry = (doc_count, os.path.basename(path))
            self.docList.append(doc_entry)

            # The context manager closes the file even if insertion fails.
            with open(path) as fp:
                for line in fp:
                    for word in line.split():
                        if self.check_stopword(word):
                            continue
                        if self.root is None:
                            # First indexed term becomes the root node.
                            self.root = self.inverted_index.createNode(
                                word, doc_entry)
                            self.inverted_index.setRoot(self.root)
                        else:
                            self.inverted_index.insert(
                                self.root, word, doc_entry)
            doc_count += 1

        self.inverted_index.setDocCount(doc_count)

        with open(index_path, 'wb') as index_file:
            pickle.dump(self.inverted_index, index_file)
        return True

    def show(self):
        """Print the inverted index data to standard output."""
        print("All inverted index data is as below:")
        print("--------------- Inverted Index ----------------")
        self.inverted_index.printTree(self.root)
        print("-----------------------------------------------")
        print()