def process_input_phrase_query(self, phrase):
    """Normalize a phrase query for index lookup.

    Splits *phrase* into words (via the class's special-character
    stripper), drops stop words, stems the remainder, and returns the
    surviving stems joined by single spaces.

    Fixes: the original built the result with repeated string
    concatenation (quadratic) and compared against ``False`` explicitly;
    this version uses ``" ".join`` over a filtered/stemmed list.
    Behavior is unchanged: same words, same order, no leading/trailing
    whitespace.
    """
    phrasal_words = self.process_query_removing_special_characters(phrase)
    # Keep only non-stop words, stemmed; join at the end (O(n), not O(n^2)).
    stemmed_words = [
        indexutils.return_stemmed_word(word)
        for word in phrasal_words
        if not indexutils.return_is_stop_word(word)
    ]
    return " ".join(stemmed_words)
def populate_index_hash(self):
    """Build the inverted index over every corpus path in ``self.docs``.

    For each document file found under each corpus path, this parses the
    document (an XML/SGML record with TEXT, TITLE, DOCNO, AUTHOR and
    BIBLIO tags -- presumably a Cranfield-style corpus; confirm against
    the actual data files), tokenizes text + title, counts term positions,
    and feeds the per-document term hash into
    ``self.update_index_dictionaries``.  Also updates
    ``self.documentIndex`` with per-document metadata and the running
    statistics ``total_documents``, ``total_words``, ``total_stop_words``
    and ``number_of_stemmed_words``.

    NOTE(review): per-term entries in tokenCountHash have the shape
    ``[term_frequency, pos1, pos2, ...]`` -- index 0 is the count, the
    rest are 1-based positions of non-stop-word tokens.
    """
    file_list=""
    index=""
    tokenCountHash={}
    count=0
    # For every path supplied as corpus path
    for documentPath in self.docs:
        #get list of files in a particular corpus path
        file_list=indexutils.get_files_in_path(documentPath)
        # Calculate the total number of documents in the corpus path
        self.total_documents=self.total_documents+len(file_list)
        # read every file in the corpus
        for file in file_list:
            documentPaths=indexutils.construct_path(documentPath,file)
            documentContent=self.get_document_content(documentPaths)
            documentTitle=""
            documentId=""
            documentAuthor=""
            documentBibilio=""
            # TODO replace the hard coded characters
            # Parse the corpus and extract the text, title, author and
            # bibliography info.  NOTE(review): assumes every tag is
            # present exactly once with a non-empty text child -- a
            # missing tag raises IndexError/AttributeError here.
            documentText=str(documentContent.getElementsByTagName('TEXT')[0].firstChild.data.strip())
            documentTitle=str(documentContent.getElementsByTagName('TITLE')[0].firstChild.data.strip())
            documentId=str(documentContent.getElementsByTagName('DOCNO')[0].firstChild.data.strip())
            documentAuthor=str(documentContent.getElementsByTagName('AUTHOR')[0].firstChild.data.strip())
            documentBibilio=str(documentContent.getElementsByTagName('BIBLIO')[0].firstChild.data.strip())
            documentDetails=()
            documentDetails=(documentId,documentTitle,documentAuthor)
            # Split the document into non punctuated words; title tokens
            # are indexed alongside body tokens.
            tokens=self.tokenize_document_text(documentText)
            title_token =self.tokenize_document_text(documentTitle)
            tokens=tokens+title_token
            # Fresh per-document hash of token -> [count, positions...]
            tokenCountHash={}
            positionCounter=0
            positionList= []
            # For every token got from corpus file
            for token in tokens:
                self.total_words=self.total_words+1
                # introduce something here if stemming has to be done here
                # Ignore stop words entirely; note the position counter is
                # NOT advanced for them (self-assignment keeps it as-is).
                if indexutils.return_is_stop_word(token):
                    positionCounter=positionCounter
                    self.total_stop_words=self.total_stop_words+1
                else:
                    positionCounter=positionCounter+1
                    token=token.strip()
                    # Perform stemming on the surviving token
                    new_word=indexutils.return_stemmed_word(token)
                    # If the word was actually changed by the stemmer,
                    # index the stem instead of the surface form.
                    if(new_word!=token):
                        self.number_of_stemmed_words=self.number_of_stemmed_words+1
                        token=new_word
                    # Add the position to the token's position list: slot 0
                    # is the occurrence count, the rest are positions in
                    # this document.
                    if tokenCountHash.has_key(token):
                        positionList= tokenCountHash[token]
                        positionList [0]= int(positionList [0])+1
                        positionList.append(positionCounter)
                        tokenCountHash.update({token:positionList})
                    else:
                        positionList=[1]
                        positionList.append(positionCounter)
                        tokenCountHash.update({token:positionList})
            # Merge this document's term hash into the global index
            self.update_index_dictionaries(documentDetails,tokenCountHash)
            # Record document metadata for doc/title/author lookups
            self.documentIndex.update({documentId:{"path":documentPaths,"title":documentTitle,"author":documentAuthor,"bibliography":documentBibilio}})
def process_query(self,input_query):
    """Dispatch a raw console query string to the matching handler.

    The first whitespace-separated word of *input_query* selects the
    command:

    * ``doc <id>``      -- print the document's text
    * ``title <id>``    -- print the document's title
    * ``similar <word>``-- words within edit distance 1 of each input word
    * ``df <term>``     -- document frequency of a term/phrase
    * ``tf <id> <term>``-- term frequency of *term* in document *id*
    * ``freq <phrase>`` -- average occurrence count of a phrase
    * anything else     -- treated as a free-text search query

    Results and errors are printed directly to stdout (Python 2 print
    statements) with ANSI color codes; the elapsed wall-clock time is
    stored in ``self.time_taken_to_query`` and printed at the end.
    """
    self.grouping_iteration=0
    # Reset the grouping flag every time query processing starts
    # TODO implement query caching
    # ANSI escape sequences for colored terminal output
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    start_time=time.time()
    # Work on a deep copy of the index so query-time mutations (if any)
    # don't corrupt the master index.
    self.index_copy=copy.deepcopy(self.index_instance.copy())
    end_time=time.time()
    # Split only on the first space: query[0] is the command word,
    # query[1] (when present) is the whole remainder.
    query=input_query.strip().split(" ",1)
    query_command=query[0]
    #sself.reload_index_file()
    if query_command=="doc":
        query_command_value=query[1]
        self.return_document_text(query_command_value)
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
    elif query_command=="title":
        query_command_value=query[1]
        title=self.return_document_title(query_command_value)
        print HEADER+title+ENDC
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
    elif query_command=="similar":
        query_command_value=query[1]
        if query_command_value==None:
            print WARNING+"Please enter the word for which you have to find similar words"+ENDC
        else:
            word_list=self.process_query_removing_special_characters(query_command_value)
            final_result=[]
            # Collect edit-distance-1 matches for every word in the input
            for word in word_list:
                result=self.process_similar_query(word)
                final_result=final_result+result
            if len(final_result)==0:
                print FAIL+"No similar words found with edit distance of 1"+ENDC
            else:
                for match_word in final_result:
                    print match_word
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
    elif query_command=="df":
        query_command_value=""
        try:
            query_command_value=query[1]
        except IndexError:
            # No argument given after "df"
            print FAIL+"Please enter the word whose document frequency is to be found"+ENDC
        if query_command_value:
            if indexutils.return_is_stop_word(query_command_value):
                # Stop words are never indexed, so their df is 0 by definition
                print 0
                print WARNING+"The word given is a stop word. Stop words are not indexed"+ENDC
            else:
                query_category=self.categorize_input_query(query_command_value)
                query_results=self.get_results_of_input_query_as_per_category(query_category)
                final_result=self.aggregate_query_results(query_results)
                end_time=time.time()
                self.time_taken_to_query=end_time-start_time
                # Document frequency = number of matching documents
                if final_result:
                    print len(final_result)
                else:
                    print 0
        else:
            print WARNING+"Enter a word or a phrases"+ENDC
    elif query_command=="tf":
        # "tf" needs two arguments, so re-split on every space
        query=input_query.split()
        document_id=""
        term=""
        try:
            document_id=query[1]
            term=query[2]
        except IndexError:
            print FAIL+"Please enter the document and word whose term frequency is to be found"+ENDC
        if indexutils.return_is_stop_word(term):
            print 0
            print WARNING+"The word given is a stop word"+ENDC
        elif self.doc_index_instance.has_key(document_id):
            if self.index_instance.has_key(term):
                # Slot 0 of the per-document posting list holds the
                # occurrence count (see populate_index_hash).
                if self.index_instance[term].has_key(document_id):
                    print self.index_instance[term][document_id][0]
                else:
                    print 0
                    print FAIL+"Word not present in document "+document_id+ENDC
        else:
            print FAIL+"Document "+document_id+ " not found"+ENDC
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
    elif query_command=="freq":
        try:
            query_command_value=query[1]
            term=query_command_value
            query_category=self.categorize_input_query(term)
            query_results=self.get_results_of_input_query_as_per_category(query_category)
            final_result=self.aggregate_query_results(query_results)
            no_of_times=0
            if final_result:
                # Sum occurrence counts (slot 0) over every matching
                # document for every sub-phrase of the query.
                for results in query_results:
                    for phrasal_queries in query_results[results]:
                        length=len(phrasal_queries)
                        for document in query_results[results][phrasal_queries]:
                            no_of_times=no_of_times+int( query_results[results][phrasal_queries][document][0])
                # NOTE(review): Python 2 integer division -- the average is
                # truncated; presumably intentional for whole-count output.
                print (no_of_times/self.query_array_length)
            else:
                print FAIL+"No matching documents found for phrase "+OKGREEN+term+ENDC
        except IndexError:
            print FAIL+"Please enter the phrase whose frequency is to be found"+ENDC
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
    else:
        # Default: treat the whole input as a free-text search query
        query_category=self.categorize_input_query(input_query)
        query_results=self.get_results_of_input_query_as_per_category(query_category)
        final_result=self.aggregate_query_results(query_results)
        end_time=time.time()
        self.time_taken_to_query=end_time-start_time
        if final_result:
            print self.print_final_results(final_result,query_results)
            print "\n"
        else:
            print FAIL+"No matching Results Found"+ENDC
    print "time taken to search: %f seconds" %self.time_taken_to_query