def process_input_phrase_query(self, phrase):
    # Normalize a phrase query: strip special characters, drop stop words
    # and stem whatever remains.
    phrasal_words = self.process_query_removing_special_characters(phrase)
    new_sentence = ""
    for word in phrasal_words:
        if not indexutils.return_is_stop_word(word):
            current_word = indexutils.return_stemmed_word(word)
            new_sentence = new_sentence + " " + current_word
    return new_sentence.strip()
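
A quick usage sketch for the method above. Hedged: it assumes indexutils wraps a standard English stop-word list and a Porter-style stemmer, which this snippet does not show, and "idx" is a hypothetical name for an initialized instance:

    # idx.process_input_phrase_query("the running dogs")
    # drops the stop word "the", stems the rest, and returns
    # roughly "run dog" with a Porter-style stemmer.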
Example #2
# These methods assume module-level "import copy" and "import time", plus
# the project's indexutils helper module.
def populate_index_hash(self):
    # For every path supplied as a corpus path
    for documentPath in self.docs:
        # Get the list of files in this corpus path
        file_list = indexutils.get_files_in_path(documentPath)
        # Accumulate the total number of documents in the corpus
        self.total_documents = self.total_documents + len(file_list)
        # Read every file in the corpus
        for file in file_list:
            documentPaths = indexutils.construct_path(documentPath, file)
            documentContent = self.get_document_content(documentPaths)
            # TODO replace the hard-coded tag names
            # Parse the corpus file and extract the text, title, author
            # and bibliography info
            documentText = str(documentContent.getElementsByTagName('TEXT')[0].firstChild.data.strip())
            documentTitle = str(documentContent.getElementsByTagName('TITLE')[0].firstChild.data.strip())
            documentId = str(documentContent.getElementsByTagName('DOCNO')[0].firstChild.data.strip())
            documentAuthor = str(documentContent.getElementsByTagName('AUTHOR')[0].firstChild.data.strip())
            documentBibilio = str(documentContent.getElementsByTagName('BIBLIO')[0].firstChild.data.strip())

            documentDetails = (documentId, documentTitle, documentAuthor)
            # Split the document into non-punctuated words; title tokens are
            # indexed alongside the body so title words are searchable too
            tokens = self.tokenize_document_text(documentText)
            title_token = self.tokenize_document_text(documentTitle)
            tokens = tokens + title_token
            # Per-document posting data, rebuilt for every file
            tokenCountHash = {}
            positionCounter = 0
            positionList = []
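            # tokenCountHash layout, as built by the loop below:
            #   {token: [term_frequency, position_1, position_2, ...]}
            # Element 0 is the in-document count; the remaining entries are
            # word positions (stop words do not advance the counter).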

            # For every token obtained from the corpus file
            for token in tokens:
                self.total_words = self.total_words + 1
                # Ignore the token if it is a stop word; stop words do not
                # advance the position counter
                if indexutils.return_is_stop_word(token):
                    self.total_stop_words = self.total_stop_words + 1
                else:
                    positionCounter = positionCounter + 1
                    token = token.strip()
                    # Otherwise perform stemming
                    new_word = indexutils.return_stemmed_word(token)
                    # Count the word only if stemming actually changed it
                    if new_word != token:
                        self.number_of_stemmed_words = self.number_of_stemmed_words + 1
                    token = new_word

                    # Add the word to the posting data: if it already exists,
                    # bump its count and append the new position to its
                    # position list for the file currently being read
                    if token in tokenCountHash:
                        positionList = tokenCountHash[token]
                        positionList[0] = positionList[0] + 1
                        positionList.append(positionCounter)
                    else:
                        tokenCountHash[token] = [1, positionCounter]

            # Update the index dictionaries
            self.update_index_dictionaries(documentDetails, tokenCountHash)
            # Update the document metadata dictionary
            self.documentIndex.update({documentId: {"path": documentPaths,
                                                    "title": documentTitle,
                                                    "author": documentAuthor,
                                                    "bibliography": documentBibilio}})
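
# documentIndex layout after indexing, for reference (the values come
# straight from the update call above):
#   {"<doc-id>": {"path": ..., "title": ..., "author": ..., "bibliography": ...}}
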
def process_query(self, input_query):
    self.grouping_iteration = 0  # reset this flag every time query processing starts
    # TODO implement query caching

    # ANSI terminal escape codes used to colorize the output below
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    start_time = time.time()
    self.index_copy = copy.deepcopy(self.index_instance)
    end_time = time.time()
    query = input_query.strip().split(" ", 1)
    query_command = query[0]
    #self.reload_index_file()

    if query_command == "doc":
        query_command_value = query[1]
        self.return_document_text(query_command_value)
        end_time = time.time()
        self.time_taken_to_query = end_time - start_time

    elif query_command == "title":
        query_command_value = query[1]
        title = self.return_document_title(query_command_value)
        print HEADER + title + ENDC
        end_time = time.time()
        self.time_taken_to_query = end_time - start_time

    elif query_command == "similar":
        try:
            query_command_value = query[1]
        except IndexError:
            # split(" ", 1) yields a single element when no argument follows,
            # so a missing word raises IndexError rather than producing None
            print WARNING + "Please enter the word for which similar words are to be found" + ENDC
        else:
            word_list = self.process_query_removing_special_characters(query_command_value)
            final_result = []
            for word in word_list:
                result = self.process_similar_query(word)
                final_result = final_result + result

            if len(final_result) == 0:
                print FAIL + "No similar words found with edit distance of 1" + ENDC
            else:
                for match_word in final_result:
                    print match_word
        end_time = time.time()
        self.time_taken_to_query = end_time - start_time
    elif query_command == "df":
        query_command_value = ""
        try:
            query_command_value = query[1]
        except IndexError:
            print FAIL + "Please enter the word whose document frequency is to be found" + ENDC

        if query_command_value:
            if indexutils.return_is_stop_word(query_command_value):
                print 0
                print WARNING + "The word given is a stop word. Stop words are not indexed" + ENDC
            else:
                query_category = self.categorize_input_query(query_command_value)
                query_results = self.get_results_of_input_query_as_per_category(query_category)
                final_result = self.aggregate_query_results(query_results)
                end_time = time.time()
                self.time_taken_to_query = end_time - start_time
                if final_result:
                    print len(final_result)
                else:
                    print 0
        else:
            print WARNING + "Enter a word or a phrase" + ENDC

    elif query_command == "tf":
        query = input_query.split()
        document_id = ""
        term = ""
        try:
            document_id = query[1]
            term = query[2]
        except IndexError:
            print FAIL + "Please enter the document and word whose term frequency is to be found" + ENDC
        else:
            if indexutils.return_is_stop_word(term):
                print 0
                print WARNING + "The word given is a stop word" + ENDC
            elif document_id in self.doc_index_instance:
                if term in self.index_instance and document_id in self.index_instance[term]:
                    # Element 0 of the posting list is the in-document count
                    print self.index_instance[term][document_id][0]
                else:
                    print 0
                    print FAIL + "Word not present in document " + document_id + ENDC
            else:
                print FAIL + "Document " + document_id + " not found" + ENDC
        end_time = time.time()
        self.time_taken_to_query = end_time - start_time
    elif query_command == "freq":
        try:
            query_command_value = query[1]
            term = query_command_value
            query_category = self.categorize_input_query(term)
            query_results = self.get_results_of_input_query_as_per_category(query_category)
            final_result = self.aggregate_query_results(query_results)
            no_of_times = 0

            if final_result:
                # Sum the per-document counts over every matched phrase
                for results in query_results:
                    for phrasal_queries in query_results[results]:
                        for document in query_results[results][phrasal_queries]:
                            no_of_times = no_of_times + int(query_results[results][phrasal_queries][document][0])

                print (no_of_times / self.query_array_length)
            else:
                print FAIL + "No matching documents found for phrase " + OKGREEN + term + ENDC

        except IndexError:
            print FAIL + "Please enter the phrase whose frequency is to be found" + ENDC

        end_time = time.time()
        self.time_taken_to_query = end_time - start_time
    else:
        # Default: treat the whole input as a free-text search query
        query_category = self.categorize_input_query(input_query)
        query_results = self.get_results_of_input_query_as_per_category(query_category)
        final_result = self.aggregate_query_results(query_results)
        end_time = time.time()
        self.time_taken_to_query = end_time - start_time
        if final_result:
            print self.print_final_results(final_result, query_results)
            print "\n"
        else:
            print FAIL + "No matching Results Found" + ENDC

    print "time taken to search: %f seconds" % self.time_taken_to_query