Example #1
0
    def get_position_text(self):
        """
        Interactively build the page text that is used for generating queries.

        The user is prompted, in order, for:
          1. whether the div IDs entered next are to be included ('i', the
             default) or excluded ('e'),
          2. a space-separated list of div IDs (defaults to ['wrapper'] when
             nothing is entered),
          3. whether to limit the text by a number of words and, if not,
             whether to limit it by a percentage of words.

        Numeric prompts are repeated until self.is_integer accepts the input.

        :return: the text returned by the extractor's get_subtext call
        """
        include_answer = raw_input("Do you want to enter divs to be included or excluded? Enter i for include, "
                                   "e for exclude")
        # anything other than 'e' (including just hitting enter) means include;
        # surrounding whitespace and letter case are ignored
        include = include_answer.strip().lower() != 'e'

        divs = raw_input("enter IDs of the divs separated by spaces \n")
        # split the answer into a list of IDs; empty or whitespace-only input
        # falls back to the default 'wrapper' div
        ids = divs.split() or ['wrapper']

        if include:
            # process the whole page first, then narrow the extractor's content
            # down to the divs with the given IDs
            pce = PositionContentExtractor()
            pce.process_html_page(self.page_html)
            pce.set_all_content(ids, "div")
        else:
            # NOTE(review): div_ids presumably names the divs the extractor
            # should ignore -- confirm against PositionContentExtractor
            pce = PositionContentExtractor(div_ids=ids)
            pce.process_html_page(self.page_html)

        # answers are compared after strip()/lower(), so 'Y', 'Yes', ' y ' all count;
        # just hitting enter means no
        yes_vals = ("y", "yes")

        limit_by_words = raw_input("enter y if you want to limit by a number of words \n")
        if limit_by_words.strip().lower() in yes_vals:
            while True:
                # the trailing space keeps the two literals from running
                # together as "usein" in the prompt
                words = raw_input("enter the number of words to use "
                                  "in generating queries \n")
                if self.is_integer(words):
                    return pce.get_subtext(num_words=int(words))

        limit_by_percent = raw_input("enter y if you want to limit by a percentage of words \n")
        if limit_by_percent.strip().lower() in yes_vals:
            while True:
                percentage = raw_input("the percentage of words to use in generating queries \n")
                if self.is_integer(percentage):
                    return pce.get_subtext(percentage=int(percentage))

        # no limit requested: use get_subtext with its defaults
        return pce.get_subtext()
Example #2
0
 def reduce_page(self, percentage):
     """
     Cut the whole page content down to a percentage of itself.

     A fresh PositionContentExtractor processes self.page_html and is then
     asked for the sub-text matching the requested percentage.

     :param percentage: the percentage of the page to be used for generating queries
     :return: the reduced page content as a string
     """
     extractor = PositionContentExtractor()
     extractor.process_html_page(self.page_html)
     reduced_text = extractor.get_subtext(percentage=percentage)
     return reduced_text
Example #3
0
 def get_position_text(self):
     """
     Build the page text from the configured divs and portion settings.

     self.page_html is processed by a PositionContentExtractor; if
     self.set_divs() leaves any divs in self.divs the extractor content is
     set to those divs. The text is then limited by self.doc_portion_count
     (passed as num_words) or, when that is not set, by
     self.doc_portion_percent (passed as percentage).

     :return: the extractor's sub-text, or '' when the portion setting that
              applies is set but is not accepted by self.is_integer
     """
     extractor = PositionContentExtractor()
     extractor.process_html_page(self.page_html)

     # populate self.divs, then narrow the content to them when there are any
     self.set_divs()
     if not self.divs:
         print("no divs, in default **")  # todo checking no divs works
     else:
         extractor.set_all_content(self.divs, "div")

     # a word count takes priority over a percentage
     if self.doc_portion_count:
         if not self.is_integer(self.doc_portion_count):
             return ''
         word_limit = int(self.doc_portion_count)
         return extractor.get_subtext(num_words=word_limit)

     if self.doc_portion_percent:
         if not self.is_integer(self.doc_portion_percent):
             return ''
         percent_limit = int(self.doc_portion_percent)
         return extractor.get_subtext(percentage=percent_limit)

     # neither limit configured: use get_subtext with its defaults
     return extractor.get_subtext()
Example #4
0
 def get_position_text(self):
     """
     Return the page text restricted by the configured divs and portion.

     Steps: process self.page_html with a PositionContentExtractor, call
     self.set_divs(), restrict the content to self.divs when it is
     non-empty, then request the sub-text limited by
     self.doc_portion_count (num_words) or, failing that,
     self.doc_portion_percent (percentage).

     :return: the extractor's sub-text; an empty string when the portion
              setting in effect is set but fails self.is_integer
     """
     content_extractor = PositionContentExtractor()
     content_extractor.process_html_page(self.page_html)

     # set the content to be that of the divs if there are any
     self.set_divs()
     if not self.divs:
         print("no divs, in default **")  # todo checking no divs works
     else:
         content_extractor.set_all_content(self.divs, "div")

     # limiting by number of words is checked before limiting by percentage
     if self.doc_portion_count:
         if self.is_integer(self.doc_portion_count):
             return content_extractor.get_subtext(num_words=int(self.doc_portion_count))
         return ""

     if self.doc_portion_percent:
         if self.is_integer(self.doc_portion_percent):
             return content_extractor.get_subtext(percentage=int(self.doc_portion_percent))
         return ""

     # no portion configured: get_subtext is called with its defaults
     return content_extractor.get_subtext()