# NOTE(review): this chunk begins mid-way through an `if sys.argv[1] == ...`
# chain; the opening branch(es) are outside this view and the indentation
# here looks scrape-mangled (elif at col 0, else at col 4) — confirm against
# the original file before editing.
# Maps the first CLI argument to a corpus representation: "b" -> "binary",
# anything else (in this visible part) -> "bow".
elif sys.argv[1] == "b":
        corpus_type = "binary"
    else:
        corpus_type = "bow"

# Optional CLI arguments with defaults:
#   argv[2] -> number of topics (default 3)
#   argv[3] -> corpus source name (default "pp_reuters")
topics_count = int(sys.argv[2]) if len(sys.argv) > 2 else 3

src = sys.argv[3] if len(sys.argv) > 3 else "pp_reuters"

# Resolve the output directory for this (corpus_type, topics_count, src)
# combination via the project naming helper.
# NOTE(review): `name`, `topics_io` and `stl` are imported elsewhere in the
# file; their behavior below is assumed from usage — verify against their
# definitions.
dtw = name.get_output_dir(corpus_type, topics_count, src)


# Load the topic list stored under <output_dir>/topics.
t_1 = dtw + "/topics"
t_list1 = topics_io.read_topics(t_1)

# Self-similarity of the topic list under several measures (each call is
# given the same list twice, so these are presumably pairwise matrices).
bha_list = stl.bha_distance(t_list1, t_list1)  # presumably Bhattacharyya distance
cos_list = stl.cos_distance(t_list1, t_list1)  # cosine distance
kl_list = stl.kl_divergence(t_list1, t_list1)  # KL divergence
jlist = stl.jaccard(t_list1, t_list1, 500)  # Jaccard; 500 looks like a top-N word cutoff — TODO confirm

# Sort each topic in place, then collect its word lists for later use.
kprepare = []
for topic in t_list1:
    topic.sort()
    kprepare.append(topic.list_words())
    
# --- Example #2 (scrape separator preserved below) ---
# 0
from scipy import stats
import sys
import utils.name_convention as namecon
from similarity.SimTopicLists import SimTopicLists

# Accumulators for the coherence ("tc"), tf-idf coherence ("tct") and
# WordNet-similarity result rows.
tclist = []
tctlist = []
wnlist = []

# One empty bucket per WordNet similarity measure, in this fixed order.
wn_names = ["path", "wup", "lch", "res", "lin", "jcn"]
for n, _measure in enumerate(wn_names):
    wnlist.append([])

# Sweep every (source, corpus representation, topic count) combination and
# collect per-topic score rows from result files written by an earlier stage.
# NOTE(review): the innermost loop body appears to continue past the end of
# this view — this edit only annotates the visible part.
for src in ["pp_reuters", "pp_brown"]:
    for corpus_type in ["tfidf", "bow", "binary"]:
        for topics_count in [5,10,15, 20]:
            dname = namecon.get_output_dir(corpus_type, topics_count, src)

            # "tc" rows: lines containing "topic"; field [1] parses as the
            # topic index (int) and field [2] as its score (float).
            # NOTE(review): file handles are never closed — consider `with`.
            subtclist = []
            ofile = open(dname + "/top_topics_20_start0.txt")
            for line in ofile:
                if "topic" in line:
                    subtclist.append(("tc"+src+corpus_type+str(topics_count)+line.split()[1],float(line.split()[2]), int(line.split()[1])))
            # Order rows by topic index (third tuple element) before appending.
            subtclist = list(sorted(subtclist, key=lambda x: x[2]))
            tclist.extend(subtclist)

            # Same scheme for the tf-idf variant, labeled "tct".
            subtctlist = []
            ofile = open(dname + "/top_topics_tfidf_20.txt")
            for line in ofile:
                if "topic" in line:
                    subtctlist.append(("tct"+src+corpus_type+str(topics_count)+line.split()[1],float(line.split()[2]), int(line.split()[1])))
            subtctlist = list(sorted(subtctlist, key=lambda x: x[2]))
            tctlist.extend(subtctlist)