Exemple #1
0
import sys

from IoHelper import IoHelper
from WebHelper import WebHelper
from NltkHelper import NltkHelper

# Command-line usage: <script> <url> [count]
# The URL is mandatory; without it there is nothing to fetch, so stop
# with a usage hint instead of an IndexError traceback.
if len(sys.argv) < 2:
	print ("Usage: {} <url> [count]".format(sys.argv[0]))
	sys.exit(1)

# sys.argv entries are already strings, so no conversion is needed
url = sys.argv[1]

# Optional second argument: the count. Fall back to 10 when it is
# missing (IndexError) or not a valid integer (ValueError).
try:
	count = int(sys.argv[2])
except (IndexError, ValueError):
	count = 10

# NOTE(review): `Helpers` is not imported in the visible part of this
# file - confirm where it comes from.
if not Helpers.urlValidator(url):
	print ("URL entered is not valid")
	# non-zero exit status so a calling shell/script can detect the failure
	sys.exit(1)

webPage = WebHelper(url)

# Parse the readable text from the html page and lower-case it
txt = webPage.text_from_html().lower()

# Optional step - write the text to a file as a temporary backup for troubleshooting
io = IoHelper("output.txt")
io.fileWriter(txt)

# Text cleansing logic below
# NOTE(review): the first argument is the literal string "txt", not the
# variable - confirm this is what txtTokenizer expects.
tokenizedTxt = NltkHelper.txtTokenizer("txt",txt)