import os
import nltk
import codecs
import sys
from bs4 import BeautifulSoup
import lxml
from lxml.html.clean import Cleaner
import re
from cStringIO import StringIO
import unicodedata

# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions. reload() is needed because
# setdefaultencoding is not reachable on the already-imported sys module.
reload(sys)
sys.setdefaultencoding('utf8')

# Shared HTML cleaner used for every input document.
cleaner = Cleaner()
# Cleaner has no "script" option (assigning it is silently ignored); the
# real switches are "scripts" (drop <script> elements) and "javascript"
# (drop embedded JavaScript), so both are enabled explicitly.
cleaner.scripts = True
cleaner.javascript = True
cleaner.style = True  # Drop <style> elements as well.
# kill_tags: the element is removed together with everything inside it.
cleaner.kill_tags = ['a', 'img', 'href']
# remove_tags: only the tag itself is removed; its children and text stay.
cleaner.remove_tags = ['div', 'span', 'li']

# Input and output folders (trailing separator kept so the values still work
# with plain string concatenation). Backslashes are escaped explicitly: the
# previous literal for the output folder contained "\t", which Python reads
# as a TAB character, so the path did not point at the intended folder.
directory1 = "C:\\Users\\Satanu\\html_test\\"
directory2 = "C:\\Users\\Satanu\\text\\"

# Clean every file found in the input folder and write the cleaned markup to
# a file with the same name in the output folder.
for filename in os.listdir(directory1):
    source_path = os.path.join(directory1, filename)
    if not os.path.isfile(source_path):
        # os.listdir also yields sub-directories, which cannot be parsed.
        continue

    # lxml opens and reads the file itself, so no separate handle is needed.
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(source_path)))

    # The context manager guarantees the output is flushed and closed even
    # if the write fails, instead of leaking one handle per input file.
    with codecs.open(os.path.join(directory2, filename), 'w', 'utf-8') as text:
        text.write(raw)