Example no. 1
    def parse(self, url, file_type, file_content):
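        # url: source URL of the file; file_type: its content type;
        # file_content: the raw downloaded content.
        # Requires: from bs4 import BeautifulSoup (plus the project's Document class).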
        # Parse the file; HTML files get their markup stripped first.
        # Reference from: https://stackoverflow.com/questions
        #   30565404/remove-all-style-scripts-and-html-tags-from-an-html-page
        text = file_content
        title = ''
        if 'html' in file_type:
            # Clean the file: don't keep the HTML markup.
            soup = BeautifulSoup(file_content, 'html.parser')
            # Remove all javascript and stylesheet code.
            for script in soup(["script", "style"]):
                script.extract()

            # Get the title of this file; guard against pages without a <title>.
            title = soup.title.get_text() if soup.title is not None else ''
            # print("The title of this file is: ", title)
            # Get the body text; fall back to the whole document if <body> is missing.
            body = soup.body if soup.body is not None else soup
            text = body.get_text()

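        # Strip leading/trailing whitespace from every line of the text.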
        lines = (line.strip() for line in text.splitlines())
        # Break each line into space-separated chunks.
        chunks = []
        for line in lines:
            for phrase in line.split(" "):  # Split on spaces.
                chunks.append(phrase.strip())
        # Drop blank chunks and join the rest, one token per line.
        text = '\n'.join(chunk for chunk in chunks if chunk)

        # Write to a file.
        self.doc_id += 1
        filename = "Doc#" + str(self.doc_id) + '.txt'
        # Ensure the file will be closed.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)

        # Only give an id to documents that are going to be parsed.
        document = Document(url, self.doc_id, filename, file_type,
                            self.stop_words)
        document.filter()
        document.stem()
        document.collection()
        # print("There're", len(document.term), "terms in document", filename)

        if 'html' in file_type:
            document.set_title(title)

        # Duplicate Detection
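        # Compare against every document stored so far; an exact duplicate is
        # not kept, but its URL is recorded as already seen.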
        for d in self.docs:
            if self.duplicate_detection(d, document) == 1:
                # print("The content of Doc#{} is exact duplicate with Doc#{}, so, we won't parse Doc#{}."
                #       .format(document.get_id(), d.get_id(), document.get_id()))
                self.url_already_seen.add(str(document.get_url()))
                return False
        self.docs.append(document)
        return True