def parse(self, url, file_type, file_content):
    """Parse a fetched file, persist its cleaned text, and register it.

    For HTML input the markup is stripped (scripts/styles removed) and the
    page title captured; for any other type the raw content is used as-is.
    The cleaned text is written to ``Doc#<id>.txt``, wrapped in a
    ``Document``, and run through filter/stem/collection. Exact duplicates
    of an already-parsed document are discarded.

    Reference for HTML cleaning approach:
    https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page

    :param url: source URL of the file.
    :param file_type: MIME/type string; HTML handling triggers when it
        contains the substring ``'html'``.
    :param file_content: raw file content (markup or plain text).
    :return: ``True`` if the document was added to ``self.docs``,
        ``False`` if it was rejected as an exact duplicate.
    """
    text = file_content
    title = ''
    if 'html' in file_type:
        # Clean the file: don't keep HTML markup.
        soup = BeautifulSoup(file_content, 'html.parser')
        # Remove all javascript and stylesheet code.
        for script in soup(["script", "style"]):
            script.extract()
        # BUGFIX: pages without a <title> tag previously raised
        # AttributeError here; fall back to the empty-string default.
        if soup.title is not None and soup.title.string is not None:
            title = soup.title.string
        # BUGFIX: documents without a <body> (fragments, malformed HTML)
        # previously raised AttributeError; fall back to the whole tree.
        body = soup.body
        text = body.get_text() if body is not None else soup.get_text()

    # Normalize whitespace: strip each line, split on single spaces,
    # and drop blank chunks, yielding one token per output line.
    lines = (line.strip() for line in text.splitlines())
    chunks = []
    for line in lines:
        for phrase in line.split(" "):  # split on literal space only
            chunks.append(phrase.strip())
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # Persist the cleaned text; ids are assigned only to parsed documents.
    self.doc_id += 1
    filename = "Doc#" + str(self.doc_id) + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

    document = Document(url, self.doc_id, filename, file_type,
                        self.stop_words)
    document.filter()
    document.stem()
    document.collection()
    if 'html' in file_type:
        document.set_title(title)

    # Exact-duplicate detection: a duplicate is remembered (so its URL
    # is not revisited) but not added to the parsed-document list.
    for d in self.docs:
        if self.duplicate_detection(d, document) == 1:
            self.url_already_seen = self.url_already_seen.union(
                {str(document.get_url())})
            return False

    self.docs.append(document)
    return True