Esempio n. 1
0
    def run(self, max_pages=10000):
        """Crawl from ``self.base_url`` and save a word count for each page visited.

        The start page is parsed once and handed to ``get_links`` to seed the
        crawl (the start page itself is not word-counted here). URLs are then
        popped from the ``self.links_to_search`` heap until it is empty or the
        page cap is reached. Each popped URL is pushed onto
        ``self.links_searched``, fetched, cleaned, counted and saved through a
        ``Document``.

        Args:
            max_pages: Stop once ``self.links_searched`` holds at least this
                many URLs. Defaults to 10000, the previously hard-coded cap.
                Pass ``None`` to crawl without a cap. The cap is checked after
                a page has been processed, so at least one queued page is
                always handled when the queue is non-empty.
        """
        # Seed the crawl from the start page.
        # NOTE(review): get_links presumably fills self.links_to_search;
        # confirm against its definition.
        self.get_links(self.parse_webpage(self.base_url))

        while self.links_to_search:
            # heappop returns the smallest queued URL, not the oldest one.
            url = heapq.heappop(self.links_to_search)
            heapq.heappush(self.links_searched, url)  # record as searched

            # Queued URLs are joined onto base_url by plain concatenation.
            soup = self.parse_webpage(self.base_url + url)
            self.get_links(soup)  # harvest further URLs from this page
            clean = self.clean_text(soup)

            doc = Document(url)
            doc.count_words(clean)
            doc.save_word_count(self.base_url)

            # Use `>=` rather than `==` so the cap still triggers if
            # links_searched already exceeds it or grows by more than one.
            if max_pages is not None and len(self.links_searched) >= max_pages:
                break
Esempio n. 2
0
def create_new_document(content):
    """Build a ``Document`` named ``"doc"`` populated from *content*.

    The result of ``Document.count_words(content)`` is stored on the new
    document's ``dictionary`` attribute, and the document is returned.
    """
    document = Document("doc")
    word_counts = document.count_words(content)
    document.dictionary = word_counts
    return document