"""Strip scripts, styles and selected tags from every HTML file in a folder.

Each file found in ``directory1`` is parsed with lxml, passed through a
configured ``Cleaner`` and the cleaned markup is written to ``directory2``
under the same file name.
"""
import os
import nltk
import codecs
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

cleaner = Cleaner()
# The Cleaner options are named ``scripts`` (drop <script> tags) and
# ``javascript`` (drop inline JS such as on* handlers); a ``script``
# attribute does not exist and assigning it has no effect.
cleaner.scripts = True
cleaner.javascript = True
cleaner.style = True
# kill_tags removes the element together with its content.
# NOTE(review): 'href' is an attribute, not an element, so it matches nothing.
cleaner.kill_tags = ['a', 'img', 'href']
# remove_tags drops only the tag itself and keeps its children/text.
cleaner.remove_tags = ['div', 'span', 'li']

# Backslashes are escaped explicitly: in a plain string literal "\t" is a TAB,
# which silently corrupted the output directory path.
directory1 = "C:\\Users\\Satanu\\html_test\\"
directory2 = "C:\\Users\\Satanu\\text\\"


def _clean_file(source_path, target_path):
    """Clean the HTML document at source_path and write it to target_path."""
    # Parse and serialise first so a parse failure does not leave an empty
    # output file behind.
    cleaned = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(source_path)))
    # The context manager guarantees the output is flushed and closed.
    with codecs.open(target_path, 'w', 'utf-8') as out:
        out.write(cleaned)


for filename in os.listdir(directory1):
    source_path = os.path.join(directory1, filename)
    # Sub-directories cannot be parsed as HTML documents; skip them.
    if not os.path.isfile(source_path):
        continue
    # The output keeps the input file name unchanged.
    _clean_file(source_path, os.path.join(directory2, filename))