def main(source):
    """Write a word-frequency report for the article stored at *source*.

    The file is read, passed through ``regexhtml.article_clean`` and split
    on whitespace after every literal "." has been replaced by a space.
    The result is written to ``source + "-analytics.dat"`` as one
    ``word: count`` line per distinct word, most frequent word first.

    Args:
        source: Path of the article file to analyse.
    """
    with open(source, "r") as datafile:
        sentences = regexhtml.article_clean(datafile.read())

    # Literal replacement on purpose: the regex pattern "." matches *any*
    # character, which would blank out the whole text instead of only
    # removing the full stops.
    sentences = sentences.replace(".", " ")

    counts = {}
    for word in sentences.split():
        counts[word] = counts.get(word, 0) + 1

    # Sort ascending by count and walk the result backwards so the most
    # frequent words are written first.
    ranked = sorted(counts.items(), key=lambda item: item[1])

    with open(source + "-analytics.dat", "w") as report:
        for word, count in reversed(ranked):
            report.write(word + ": " + str(count) + "\n")
def main():
    """Write a "-cloud.txt" companion file for each article in ./article/.

    For every ``.txt`` file in the article directory, the text of the first
    ``<title>...</title>`` element is printed and written at the top of
    ``<file>-cloud.txt``. Then, for each whitespace-separated word of that
    title (double quotes and colons removed), the word is written on its
    own line followed by every "."-delimited piece of the cleaned article
    text (as returned by ``regexhtml.article_clean``) that contains it.

    Files without a ``<title>`` element and previously generated
    ``-cloud.txt`` files are skipped.
    """
    article_dir = "./article/"
    cloud_suffix = "-cloud.txt"

    for filename in os.listdir(article_dir):
        if not filename.endswith(".txt"):
            continue
        # Generated output also ends in ".txt"; skip it so a second run
        # does not process its own results.
        if filename.endswith(cloud_suffix):
            continue

        with open(article_dir + filename, "r") as article_file:
            raw_text = article_file.read()

        title_match = re.search("<title>(.*?)</title>", raw_text)
        if title_match is None:
            # Nothing to build the cloud from; skip before creating output.
            continue
        title_text = title_match.group(1)
        print(title_text)

        # Split once; the pieces are reused for every title word below.
        sentences = regexhtml.article_clean(raw_text).split(".")
        title_words = title_text.replace("\"", "").replace(":", "").split()

        with open(article_dir + filename + cloud_suffix, "w") as dat_file:
            dat_file.write(title_text + "\n\n")
            for word in title_words:
                dat_file.write("\n" + word + "\n")
                for sentence in sentences:
                    if word in sentence:
                        dat_file.write(sentence + ".\n")