Example #1
class Pepper(object):
    """
    The Pepper Potts of UI (Public Relations) for Tony Stark. Handles user
    input of queries, parses the queries, and returns results from the
    corpus indexed by Ironman.
    """

    def __init__(self, documents, NDC, stop_words):
        super(Pepper, self).__init__()
        self.documents = documents
        self.NDC = NDC
        self.p = PorterStemmer()
        self.stop_words = stop_words

    def handleQuery(self, user_input):
        """
        Handles the process of formatting a user_inputted query
        """
        scores = []
        stem_query = self.p.stemText(user_input, self.stop_words).encode('utf_8', 'ignore')
        query = Document(stem_query, full_text=user_input)
        self.NDC.normalize(query)
        for document in self.documents:
            scores.append((self.NDC.score(query, document), document))
        # Sort by score only, so ties never fall back to comparing Document objects
        scores.sort(key=lambda pair: pair[0], reverse=True)
        return scores

    def score(self, query, document):
        # Placeholder scoring hook; always returns a constant score
        return 1
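
For context, here is a minimal usage sketch of the class above. The names indexed_documents, ndc_scorer, and stop_words are placeholders for whatever the project's Ironman crawler/indexer actually produces; only Pepper and handleQuery come from the example itself.

# Hypothetical wiring; the three constructor arguments are assumed to come
# from the project's indexing step and stop-word list.
pepper = Pepper(indexed_documents, ndc_scorer, stop_words)

# handleQuery returns (score, Document) tuples sorted best-first
for score, document in pepper.handleQuery("tony stark arc reactor")[:10]:
    print(score, document)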
Example #2
import hashlib


class Parser(object):
    """
    The parsing workhorse of the entire project.
    """

    def __init__(self, stop_words, **kwargs):
        """
        The constructor for the Parser object.

        @stop_words: either a list of stop words or None
        """
        super(Parser, self).__init__()
        # Lower-case each stop word if a list was provided
        if stop_words is not None:
            self.stop_words = []
            for word in stop_words:
                self.stop_words.append(word.lower())
        else:
            self.stop_words = None
        self.hashes = []
        self.documents = []
        self.num_duplicates = 0
        self.p = PorterStemmer()

    def retrieveText(self, page_soup, url):
        """
        Retrieves all the non-markup text from a webpage that
        has already been crawled.

        @page_soup: The soupified version of a webpage
        @url: The URL the page was crawled from
        """
        # Retrieve all the text of the page minus the html tags
        page_text = page_soup.get_text()
        # Stem the page text, dropping stop words
        stem_text = self.p.stemText(page_text, self.stop_words).encode('utf_8', 'ignore')
        # Create a hash to make sure there are no 100% duplicates in the pages
        # The hex digest will also be used as the document ID, since they will
        # be unique unless they are a duplicate
        h = hashlib.md5()
        h.update(stem_text)
        page_hash = h.hexdigest()
        # If the page is not a duplicate, add the hash to a list of found
        # hashes, and create a Document object to keep track of the information
        # for each Document
        if page_hash not in self.hashes:
            self.hashes.append(page_hash)
            self.documents.append(Document(stem_text, page_text, url, page_hash))
        else:
            self.num_duplicates += 1
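
To show how retrieveText is typically driven, here is a small, hedged usage sketch. It assumes BeautifulSoup provides the soupified page and that the project's PorterStemmer and Document classes are importable as in the example; the stop-word list and URLs are illustrative only.

from bs4 import BeautifulSoup

# Illustrative stop words; the real list would be loaded from the project's data files
parser = Parser(["the", "a", "an", "and", "of"])

html = "<html><body><h1>Iron Man</h1><p>Tony Stark builds the arc reactor.</p></body></html>"
page_soup = BeautifulSoup(html, "html.parser")

# The first call stores a new Document keyed by the MD5 of its stemmed text;
# the second call sees the identical hash and is counted as a duplicate.
parser.retrieveText(page_soup, "http://example.com/ironman")
parser.retrieveText(page_soup, "http://example.com/ironman-copy")

print(len(parser.documents))   # 1
print(parser.num_duplicates)   # 1

Keying documents by the hexdigest of the stemmed text drops exact duplicates before indexing, but near-duplicates (pages differing by even one word) still pass through.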