Example #1
0
def main(source):
	"""Write a word-frequency report for the article stored at *source*.

	The file is read and passed through ``regexhtml.article_clean``; periods
	in the result are turned into spaces and the text is split on whitespace.
	Every distinct word is counted and the counts are written to
	``<source>-analytics.dat`` as one ``word: count`` line per word, with the
	highest counts first.

	Args:
		source: Path of the article file to analyse.
	"""
	# Function-scope import so the block does not depend on the file header.
	from collections import Counter

	with open(source, "r") as datafile:
		sentences = regexhtml.article_clean(datafile.read())

	# Literal replacement on purpose: a regex pattern of an unescaped "."
	# matches every character and would blank out the whole text.
	sentences = sentences.replace(".", " ")

	counts = Counter(sentences.split())

	# Stable ascending sort, walked backwards below, so the most frequent
	# words are written first.
	ranked = sorted(counts.items(), key=lambda item: item[1])

	with open(source + "-analytics.dat", "w") as report:
		for word, count in reversed(ranked):
			report.write(word + ": " + str(count) + "\n")
Example #2
0
def main():
	"""Build a "-cloud.txt" summary for every ``.txt`` article in ``./article/``.

	For each article the first ``<title>...</title>`` match is printed and
	written as the heading of ``<article>-cloud.txt``.  Then, for every word of
	that title (double quotes and colons removed), the word is written followed
	by each period-delimited sentence of the ``regexhtml.article_clean`` output
	that contains it.  Articles without a ``<title>`` match are skipped.
	"""
	article_dir = "./article/"
	cloud_suffix = "-cloud.txt"

	for name in os.listdir(article_dir):
		# Generated summaries also end in ".txt"; skip them so a second run
		# does not treat its own output as input.
		if not name.endswith(".txt") or name.endswith(cloud_suffix):
			continue

		path = os.path.join(article_dir, name)
		with open(path, "r") as article:
			content = article.read()

		titles = re.findall(r"<title>(.*?)</title>", content)
		if not titles:
			# Without a title there are no keywords to search for; checking
			# before opening the output avoids leaving an empty file behind.
			continue
		title = titles[0]

		print(title)

		# Split once: the sentence list is the same for every title word.
		sentences = regexhtml.article_clean(content).split(".")

		with open(path + cloud_suffix, "w") as dat_file:
			dat_file.write(title + "\n\n")
			for keyword in title.replace("\"", "").replace(":", "").split():
				dat_file.write("\n" + keyword + "\n")
				for sentence in sentences:
					if keyword in sentence:
						dat_file.write(sentence + ".\n")