import sys

# Assumed imports: the script clearly relies on the project's naming helper
# and a topic I/O module; the exact module paths below are inferred from
# usage and from the sibling script's "utils.name_convention" import.
import utils.name_convention as name
import utils.topics_io as topics_io
from similarity.SimTopicLists import SimTopicLists

# stl is assumed to be a SimTopicLists instance providing the pairwise
# similarity/divergence methods used below.
stl = SimTopicLists()

# First branch reconstructed (assumption): "t" selects the tfidf corpus,
# mirroring the "b"/binary branch and the "bow" default.
if sys.argv[1] == "t":
    corpus_type = "tfidf"
elif sys.argv[1] == "b":
    corpus_type = "binary"
else:
    corpus_type = "bow"

if len(sys.argv) <= 2:
    topics_count = 3
else:
    topics_count = int(sys.argv[2])

if len(sys.argv) <= 3:
    src = "pp_reuters"
else:
    src = sys.argv[3]

dtw = name.get_output_dir(corpus_type, topics_count, src)
t_1 = dtw + "/topics"
t_list1 = topics_io.read_topics(t_1)

# Pairwise topic-to-topic comparison matrices over the same topic list.
bha_list = stl.bha_distance(t_list1, t_list1)
cos_list = stl.cos_distance(t_list1, t_list1)
kl_list = stl.kl_divergence(t_list1, t_list1)
jlist = stl.jaccard(t_list1, t_list1, 500)

# Sort each topic's terms and collect the word lists for the
# WordNet-based comparisons that follow.
kprepare = []
for topic in t_list1:
    topic.sort()
    kprepare.append(topic.list_words())
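# A minimal sketch of what the distance calls above compute, assuming each
# topic is a discrete word distribution over a shared vocabulary.
# SimTopicLists' own implementation may differ; the helper names
# (bhattacharyya, kl) and the numpy representation are illustrative only.
import numpy as np

def bhattacharyya(p, q):
    # Bhattacharyya distance: -ln( sum_i sqrt(p_i * q_i) );
    # 0 for identical distributions, larger for more dissimilar ones.
    return -np.log(np.sum(np.sqrt(p * q)))

def kl(p, q, eps=1e-12):
    # KL divergence D(p || q), smoothed to avoid log(0) on sparse topics.
    p = p + eps
    q = q + eps
    return np.sum(p * np.log(p / q))

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.4, 0.4, 0.2])
print(bhattacharyya(p, q), kl(p, q))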
from scipy import stats
import sys
import utils.name_convention as namecon
from similarity.SimTopicLists import SimTopicLists

tclist, tctlist, wnlist = [], [], []

# One result list per WordNet similarity measure.
wn_names = ["path", "wup", "lch", "res", "lin", "jcn"]
for n in range(len(wn_names)):
    wnlist.append([])

for src in ["pp_reuters", "pp_brown"]:
    for corpus_type in ["tfidf", "bow", "binary"]:
        for topics_count in [5, 10, 15, 20]:
            dname = namecon.get_output_dir(corpus_type, topics_count, src)

            # Topic-coherence results: each "topic" line carries a topic
            # id (field 1) and its coherence score (field 2).
            subtclist = []
            with open(dname + "/top_topics_20_start0.txt") as ofile:
                for line in ofile:
                    if "topic" in line:
                        subtclist.append(
                            ("tc" + src + corpus_type + str(topics_count) + line.split()[1],
                             float(line.split()[2]),
                             int(line.split()[1])))
            subtclist = sorted(subtclist, key=lambda x: x[2])
            tclist.extend(subtclist)

            # Same parse for the tf-idf-weighted coherence file.
            subtctlist = []
            with open(dname + "/top_topics_tfidf_20.txt") as ofile:
                for line in ofile:
                    if "topic" in line:
                        subtctlist.append(
                            ("tct" + src + corpus_type + str(topics_count) + line.split()[1],
                             float(line.split()[2]),
                             int(line.split()[1])))
            subtctlist = sorted(subtctlist, key=lambda x: x[2])
            tctlist.extend(subtctlist)
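# The wn_names above ("path", "wup", "lch", "res", "lin", "jcn") match the
# six WordNet similarity measures shipped with NLTK. A minimal sketch of
# scoring one word pair with each measure; the example words and the choice
# of the Brown information-content file are assumptions for illustration.
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic("ic-brown.dat")
s1 = wn.synsets("market")[0]
s2 = wn.synsets("trade")[0]

print(s1.path_similarity(s2))
print(s1.wup_similarity(s2))
print(s1.lch_similarity(s2))            # requires synsets of the same POS
print(s1.res_similarity(s2, brown_ic))  # res/lin/jcn need a corpus IC
print(s1.lin_similarity(s2, brown_ic))
print(s1.jcn_similarity(s2, brown_ic))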