def fix_json(name):
    """Rewrite a file of newline-delimited JSON records as one JSON array.

    Reads the raw file, drops a trailing empty line (produced by a final
    newline), joins the remaining lines with commas inside "[...]", writes
    the result back, then round-trips the file through the JSON
    loader/saver to normalize its serialized form.
    """
    raw = iotools.load_raw(name)
    records = raw.split("\n")
    # A file ending in a newline yields one empty trailing entry -- drop it.
    if records[-1] == "":
        records.pop()
    iotools.save_raw("[%s]" % ",".join(records), name)
    # Reload and re-save so the file ends up in canonical JSON formatting.
    parsed = iotools.load_json(name)
    iotools.save_json(parsed, name)
def get_table():
    """Build a per-category similarity summary and save it as CSV-style text.

    For every category, records the within-category ("inner") similarity and
    the mean similarity to all other categories ("outer"), for both the new
    weighted similarity matrix and the old one.

    Returns nothing; writes the table to 'output/similarity/table.xls'.
    """
    matrix_new = get_similarity_new_matrix_weighted()
    matrix_old = get_similarity_old_matrix()
    cats = list(matrix_new.keys())
    ret = {}
    ret_text = "cat, inner-new, inner-old, outer-new, outer-old\n"
    for cat1 in cats:
        others = [cat2 for cat2 in cats if cat2 != cat1]
        row = {
            "inner-new": matrix_new[cat1][cat1],
            "inner-old": matrix_old[cat1][cat1],
            # Mean similarity against every other category; guard against
            # a single-category matrix (the original divided by len-1 and
            # would raise ZeroDivisionError).
            "outer-new": (sum(matrix_new[cat1][c] for c in others) / len(others)
                          if others else 0),
            "outer-old": (sum(matrix_old[cat1][c] for c in others) / len(others)
                          if others else 0),
        }
        # BUG FIX: the original assigned ret["inner-new"] etc. onto the
        # top-level dict, clobbering the values each iteration instead of
        # storing them under ret[cat1].
        ret[cat1] = row
        ret_text += "%s, %s, %s, %s, %s\n" % (
            cat1, row["inner-new"], row["inner-old"],
            row["outer-new"], row["outer-old"])
    iotools.save_raw(ret_text, 'output/similarity/table.xls')
def tag_cloud_text_new_keywords_simple():
    """Collect every keyword from every dataset's 'all' keyword dict.

    Returns a flat list of keywords (duplicates across datasets are kept).
    """
    ret = []
    for dataset, keywords in iotools.load_dataset_keywords_dict().items():
        ret += keywords['all'].keys()
    return ret


def remove_tops(keyword_list, percent):
    """Return keyword_list without the top `percent`% most frequent words.

    The cutoff is computed over *unique* words ranked by frequency, not
    over total occurrences; every occurrence of a removed word is dropped.
    Returns a list (the original returned a lazy `filter` object, which
    could only be consumed once).
    """
    ranking = [word for word, count in Counter(keyword_list).most_common()]
    remove_count = int(len(ranking) * percent / 100.0)
    # Use a set for O(1) membership tests instead of rescanning a list
    # slice for every element of keyword_list.
    removed = set(ranking[:remove_count])
    return [word for word in keyword_list if word not in removed]


if __name__ == "__main__":
    keywords_sets = {
        'Old Keywords': tag_cloud_text_old_keywords(),
        'New Keywords': tag_cloud_text_new_keywords_simple(),
        'New Keywords Weighted': tag_cloud_text_new_keywords_weighted(),
    }
    # BUG FIX: iterate over a snapshot -- the original added new keys to
    # keywords_sets while iterating it, which raises
    # "RuntimeError: dictionary changed size during iteration" in Python 3.
    for key, keywords in list(keywords_sets.items()):
        keywords_sets[key + " (Exclude top 10%)"] = remove_tops(keywords, 10)
    for key, keywords in keywords_sets.items():
        iotools.save_raw("\n".join(keywords), 'output/tagcloud/%s.txt' % key)