Example #1
0
def fix_json(name):
    """Rewrite the file *name* so its lines become one bracketed, comma-separated list.

    The raw text is read through ``iotools.load_raw``, split on newlines,
    joined with commas inside ``[`` ... ``]`` and written back.  The result
    is then round-tripped through ``iotools.load_json`` /
    ``iotools.save_json``.

    NOTE(review): presumably each input line is a standalone JSON value
    (JSON-lines style) and the round trip normalizes the formatting --
    confirm against ``iotools``.
    """
    records = iotools.load_raw(name).split("\n")

    # A text ending in a newline yields one empty trailing entry; drop it so
    # the joined list does not end with a dangling comma.
    if not records[-1]:
        records.pop()

    iotools.save_raw("[" + ",".join(records) + "]", name)

    # Reload what was just written and save it again via the JSON helpers.
    iotools.save_json(iotools.load_json(name), name)
Example #2
0
def get_table():
    """Build the inner/outer similarity table per category and save it.

    For every category in the new (weighted) similarity matrix:

    * ``inner-*`` is the matrix diagonal entry ``matrix[cat][cat]``;
    * ``outer-*`` is the mean of ``matrix[cat][other]`` over every other
      category (0 when there is no other category).

    The table is written as comma-separated text to
    ``output/similarity/table.xls`` through ``iotools.save_raw``.

    Returns:
        dict mapping each category to a dict with the keys ``inner-new``,
        ``inner-old``, ``outer-new`` and ``outer-old``.
    """
    matrix_new = get_similarity_new_matrix_weighted()
    matrix_old = get_similarity_old_matrix()
    cats = list(matrix_new.keys())
    # Number of "other" categories each outer average is taken over.
    other_count = len(cats) - 1

    table = {}
    rows = ["cat, inner-new, inner-old, outer-new, outer-old\n"]

    for cat in cats:
        if other_count:
            outer_new = sum(
                matrix_new[cat][other] for other in cats if other != cat
            ) / other_count
            outer_old = sum(
                matrix_old[cat][other] for other in cats if other != cat
            ) / other_count
        else:
            # A single category has nothing to compare against; avoid
            # dividing by zero and report 0 for the outer similarity.
            outer_new = 0
            outer_old = 0

        # Store the stats under the category itself so rows do not
        # overwrite one another.
        stats = {
            "inner-new": matrix_new[cat][cat],
            "inner-old": matrix_old[cat][cat],
            "outer-new": outer_new,
            "outer-old": outer_old,
        }
        table[cat] = stats

        rows.append("%s, %s, %s, %s, %s\n" % (
            cat,
            stats["inner-new"],
            stats["inner-old"],
            stats["outer-new"],
            stats["outer-old"],
        ))

    iotools.save_raw("".join(rows), 'output/similarity/table.xls')
    return table
Example #3
0
def tag_cloud_text_new_keywords_simple():
    """Return one flat list with the keys of each dataset's ``'all'`` entry.

    The datasets come from ``iotools.load_dataset_keywords_dict()``.  A
    keyword present in several datasets appears once per dataset.
    """
    collected = []
    for entry in iotools.load_dataset_keywords_dict().values():
        collected.extend(entry['all'].keys())
    return collected


def remove_tops(keyword_list, percent):
    """Drop every occurrence of the most frequent keywords.

    Args:
        keyword_list: iterable of hashable keywords; repeats count towards
            a keyword's frequency.
        percent: share (0-100) of the *distinct* keywords to remove,
            starting from the most frequent one.  The resulting count is
            truncated towards zero.

    Returns:
        A new list with the remaining keywords in their original order.
    """
    # Materialize once so a one-shot iterator can be both counted and
    # filtered.
    keyword_list = list(keyword_list)
    ranked = [word for word, _ in Counter(keyword_list).most_common()]

    remove_count = int(len(ranked) * percent / 100.0)
    # Build the lookup set once instead of re-slicing the ranking for every
    # keyword.
    banned = set(ranked[:remove_count])

    return [word for word in keyword_list if word not in banned]


if __name__ == "__main__":
    # Keyword collections to export, keyed by the output file's base name.
    keywords_sets = {
        'Old Keywords': tag_cloud_text_old_keywords(),
        'New Keywords': tag_cloud_text_new_keywords_simple(),
        'New Keywords Weighted': tag_cloud_text_new_keywords_weighted(),
    }

    # Add a variant of every set without its top 10% most frequent keywords.
    # Iterate over a snapshot: inserting keys while looping over the live
    # items() view raises RuntimeError on Python 3.
    for key, keywords in list(keywords_sets.items()):
        keywords_sets[key + " (Exclude top 10%)"] = remove_tops(keywords, 10)

    # Write each set as one keyword per line.
    for key, keywords in keywords_sets.items():
        iotools.save_raw("\n".join(keywords), 'output/tagcloud/%s.txt' % key)