#from urllib.request import urlopen, Request
import urllib2

# Python 2 script section: for every record in the input JSON, download the
# page, reduce it to its main article markup, save that markup to disk, and
# collect the whitespace-normalised plain text in ``json_data``.

# Expected to hold a JSON array of objects; only the 'id' and 'uid' keys of
# each object are read below.
input_json_path = "./input/gt_data.json"

# Result accumulator: uid -> {'id': ..., 'uid': ..., 'content': ...}.
json_data = {}

with open(input_json_path) as data_file:
    input_data = json.load(data_file)

for data in input_data:
    # data['id'] is used as the URL to fetch; a custom User-Agent header is
    # sent with the request.
    request = urllib2.Request(data['id'], headers={'User-Agent': "Magic Browser"})
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails part-way.
        response.close()

    # NOTE(review): Document is presumably readability's Document and the
    # positional True its html_partial flag -- confirm against the import.
    readable_article = Document(html).summary(True)
    writable_tag_data = readable_article.encode('utf-8')

    # Save processed html files. The context manager guarantees the handle is
    # closed even when the write raises.
    with open('./data_op_readability/file_4_' + str(data['uid']) + ".html", 'w') as f:
        f.write(writable_tag_data)

    # Get pure content: strip the markup, then collapse every whitespace run
    # into a single space and trim both ends.
    doc = BeautifulSoup(writable_tag_data, 'lxml')
    full_text = doc.get_text().encode('utf-8')
    # Raw string so the backslash reaches the regex engine unambiguously.
    processed_data = re.sub(r'\s+', ' ', full_text).strip()

    op_json = {
        'id': data['id'],
        'uid': data['uid'],
        'content': processed_data,
    }
    json_data[data['uid']] = op_json

with open('./output_readability_txt/processed_data_readability_'+str(int(time.time()))+'.json', 'w') as outfile:
	"http://losangeles.backpage.com/FemaleEscorts/sexy-caramel-barbie-doll-cute-girl/63871582",
	"http://losangeles.backpage.com/FemaleEscorts/realy-new-japanese-young-girl-pretty-sweet-cozy-massage-services-6572276076/65033335",
	"http://losangeles.backpage.com/FemaleEscorts/morning-specials-beautiful-and-latina-come-see-a-and-juicy-girl-who-loves-handsome-men/65025069",
	"http://losangeles.backpage.com/FemaleEscorts/luxuryspa-lovelylatinas-40510-freeway/62359660",
	"http://losangeles.backpage.com/FemaleEscorts/sexy-belizean-godess-big-booty-caramel-all-r-e-l-come-get-your-fixutit/62497186",
	"http://losangeles.backpage.com/FemaleEscorts/sexy-belizean-godess-big-booty-caramel-all-r-e-l-come-get-your-fixutit/62268601",
	"http://losangeles.backpage.com/FemaleEscorts/b-g-213331o692-8oqv-lax/64991663",
	"http://losangeles.backpage.com/FemaleEscorts/i-can-come-to-yuh/65054337",
	"http://losangeles.backpage.com/FemaleEscorts/lax-incall-1oo-specials-h0t-asian-latina-mix/63277038",
	"http://losangeles.backpage.com/FemaleEscorts/way-2-hot-2-handle-foxxy-brazilian-looking-2-have-fun-fun-fun-and-play-play-play/49623772",
	"http://losangeles.backpage.com/FemaleEscorts/sexy-beauty-andso-hot/33317924",
	"http://losangeles.backpage.com/FemaleEscorts/open-minded-sexy-brunette-janet-outcall/33471446"
] 

# One-off probe: download a single hard-coded page, print the article markup
# extracted from it, and terminate.
# NOTE(review): sys.exit() ends the run here, so no statement placed after
# this probe is ever executed -- confirm whether this is leftover debugging.
probe_url = "http://www.eroticmugshots.com/ftlauderdale-escorts/954-601-7752/?pid=36770728"
html = urllib.urlopen(probe_url).read()
readable_article = Document(html).summary(True)
print(readable_article)
sys.exit()

i = 21
for urlex in arr:
	html = urllib.urlopen(urlex).read()
	readable_article = Document(html).summary(True)
	i += 1

	tags = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'
	data = readable_article.encode('utf-8')
	tags_end = "</body></html>"
	# soup = BeautifulSoup(data,"lxml")
	# metatag = soup.new_tag('head')
	# soup.html.insert(0,metatag)