def getLdaSclusterResultForm(prefix, lda_k, scluster_k):
    """Build the form relating LDA topic scores to an scluster clustering.

    Input files (paths are relative to the working directory):

    * ``server/static/data/lda/<prefix>/<lda_k>/final.other`` -- ``alpha`` is
      the second whitespace-separated token of the second-to-last element of
      the file split on newlines (i.e. the last line when the file ends with
      a newline).
    * ``server/static/data/lda/<prefix>/<lda_k>/final.gamma`` -- one row of
      numbers per document; ``alpha`` is subtracted from every value and the
      row is passed through ``normalize``.
    * ``data/<prefix>/data.out.clustering.<scluster_k>`` -- one integer
      cluster label per document, paired positionally with the gamma rows.

    Tables produced, in order: normalised average topic score per cluster,
    average squared ``dot`` similarity inside each cluster, that value
    divided by the global average, the same two for pairs that straddle
    clusters, and the per-cluster distribution of the 20 highest-scoring
    documents of every topic.

    Args:
        prefix: data-set name interpolated into the paths above.
        lda_k: number of LDA topics (int or numeric string).
        scluster_k: number of clusters (int or numeric string).

    Returns:
        The result of calling the object built by ``form.Form``.

    Raises:
        IOError: if one of the input files cannot be opened.
        IndexError: if the gamma file contains no rows.
        KeyError: if a cluster label is outside ``range(scluster_k)``.
    """
    num_topics = int(lda_k)
    num_clusters = int(scluster_k)
    top = 20  # size of the per-topic "top documents" list
    table_attrs = {'border': "1", 'cellspacing': "0", 'cellpadding': "3"}

    def ratio(numerator, denominator):
        # An empty cluster, a cluster with a single document (no internal
        # pairs) or an all-zero row has nothing to average; report 0.0
        # instead of raising ZeroDivisionError.
        if not denominator:
            return 0.0
        return numerator / float(denominator)

    with open('server/static/data/lda/%s/%s/final.other'
              % (prefix, lda_k), 'r') as f:
        alpha = float(f.read().split('\n')[-2].split()[1])
    with open('server/static/data/lda/%s/%s/final.gamma'
              % (prefix, lda_k), 'r') as f:
        gamma_rows = f.read().split('\n')[:-1]
    with open('data/%s/data.out.clustering.%s'
              % (prefix, scluster_k), 'r') as f:
        clusters = [int(label) for label in f.read().split()]

    topic_scores = [normalize([float(value) - alpha for value in row.split()])
                    for row in gamma_rows]
    # Materialised because the pairs are walked several times below.
    data = list(zip(clusters, topic_scores))

    # --- normalised average topic score per cluster -----------------------
    sums = [[0.0] * num_topics for _ in range(num_clusters)]
    sizes = [0] * num_clusters
    for cluster, scores in data:
        sizes[cluster] += 1
        sums[cluster] = [a + b for a, b in zip(scores, sums[cluster])]
    avg_results = []
    for size, row in zip(sizes, sums):
        means = [ratio(value, size) for value in row]
        norm = sum(means)
        avg_results.append([ratio(value, norm) for value in means])

    # --- internal / external squared-dot similarities ---------------------
    members = dict((cluster, []) for cluster in range(num_clusters))
    for cluster, scores in data:
        members[cluster].append(scores)

    in_sims = [0.0] * num_clusters
    ex_sims = [0.0] * num_clusters
    pair_count = 0
    total_sim = 0.0
    for cluster in range(num_clusters):
        docs = members[cluster]
        n_pairs = 0
        acc = 0.0
        # Ordered pairs: (a, b) and (b, a) are both counted.
        for j, a in enumerate(docs):
            for k, b in enumerate(docs):
                if j != k:
                    n_pairs += 1
                    acc += dot(a, b) ** 2
        pair_count += n_pairs
        total_sim += acc
        in_sims[cluster] = ratio(acc, n_pairs)
    for cluster in range(num_clusters):
        n_pairs = 0
        acc = 0.0
        for a in members[cluster]:
            for other in range(num_clusters):
                if other != cluster:
                    for b in members[other]:
                        n_pairs += 1
                        acc += dot(a, b) ** 2
        total_sim += acc
        pair_count += n_pairs
        ex_sims[cluster] = ratio(acc, n_pairs)
    global_sim = ratio(total_sim, pair_count)

    # --- where do the best documents of each topic live? ------------------
    num_dims = len(topic_scores[0])
    mat = [[0] * num_clusters for _ in range(num_dims)]
    for topic in range(num_dims):
        # Highest score first: the table is titled "top 20", so the slice
        # must take the best-scoring documents, not the lowest-scoring ones.
        ranked = sorted(data, key=lambda pair: pair[1][topic], reverse=True)
        for cluster, _scores in ranked[:top]:
            mat[topic][cluster] += 1
    # NOTE(review): the table title says "normalized by the number of nodes
    # in each scluster", but each row is divided by its own sum; the
    # arithmetic is left as it was -- confirm which one is intended.
    normalised = []
    for row in transpose(mat):
        row_sum = sum(row)
        normalised.append([ratio(count, row_sum) for count in row])
    mat = normalised

    sim_category = list(range(num_clusters))
    components = [
        form.Table('normalize average topic scores',
                   h_category=list(range(num_topics)),
                   v_category=list(range(len(avg_results))),
                   cross='lda/scluster', use_id=True, pairs=avg_results,
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('average internal similarities',
                   h_category=sim_category, pairs=[in_sims], use_id=True,
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('internal ratio', h_category=sim_category, use_id=True,
                   pairs=[[ratio(value, global_sim) for value in in_sims]],
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('average external similarities',
                   h_category=sim_category, pairs=[ex_sims], use_id=True,
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('external ratio', h_category=sim_category, use_id=True,
                   pairs=[[ratio(value, global_sim) for value in ex_sims]],
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('#docs among top 20 in topic i that are in cluster j normalized by the number of nodes in each scluster:',
                   h_category=list(range(num_topics)), pairs=mat,
                   v_category=list(range(len(mat))), use_id=True,
                   cross='lda/scluster', **table_attrs),
        form.tr(''),
    ]
    js = form.js('script', 'function create_graph(id) { alert(id);}')
    # Extended call syntax replaces the deprecated apply() built-in.
    return form.Form(*components, js=js)()
def getSclusterVsVcluster(prefix, k):
    """Build the form comparing the scluster and vcluster solutions.

    Reads four files under ``data/<prefix>/``:

    * ``data.out.clustering.<k>.output`` and
      ``data.mat.clustering.<k>.output`` -- statistics reports, split on
      newlines and parsed by ``getMatches``;
    * ``data.out.clustering.<k>`` and ``data.mat.clustering.<k>`` -- one
      cluster label per document, paired positionally.

    The ``Cov`` matrix has one row per scluster label and ``k + 1`` columns
    for vcluster labels ``-1 .. k-1`` (column index is ``label + 1``;
    presumably ``-1`` marks documents vcluster left unassigned -- confirm).
    Each cell is the share of the row's documents that carry that vcluster
    label minus that label's share of all documents.

    Args:
        prefix: data-set name interpolated into the paths above.
        k: number of clusters (int or numeric string).

    Returns:
        The result of calling the object built by ``form.Form``.

    Raises:
        IOError: if one of the input files cannot be opened.
    """
    format_float = '%.5f'
    num_clusters = int(k)
    table_attrs = {'border': "1", 'cellspacing': "0", 'cellpadding': "3"}

    def ratio(numerator, denominator):
        # Empty rows / empty input would otherwise raise ZeroDivisionError.
        if not denominator:
            return 0.0
        return numerator / float(denominator)

    def read_text(path):
        # ``with`` guarantees the handle is closed even if read() fails.
        with open(path, 'r') as handle:
            return handle.read()

    s_stats = read_text(
        'data/%s/data.out.clustering.%s.output' % (prefix, k)).split('\n')
    v_stats = read_text(
        'data/%s/data.mat.clustering.%s.output' % (prefix, k)).split('\n')
    s_matches, category = getMatches(s_stats)
    v_matches, _ = getMatches(v_stats)

    clusters1 = read_text(
        'data/%s/data.out.clustering.%s' % (prefix, k)).split()
    clusters2 = read_text(
        'data/%s/data.mat.clustering.%s' % (prefix, k)).split()

    # Column j holds vcluster label ``labels[j]`` (= j - 1).
    labels = list(range(-1, num_clusters))
    counts = [[0] * (num_clusters + 1) for _ in range(num_clusters)]
    for s_label, v_label in zip(clusters1, clusters2):
        counts[int(s_label)][int(v_label) + 1] += 1
    total = [0] * (num_clusters + 1)
    for v_label in clusters2:
        total[int(v_label) + 1] += 1
    total_nodes = sum(total)
    fractions = [ratio(size, total_nodes) for size in total]

    cov = []
    for row in counts:
        row_sum = sum(row)
        cov.append([ratio(count, row_sum) - fraction
                    for count, fraction in zip(row, fractions)])
    print_cov = [[format_float % value for value in row] for row in cov]

    components = [
        form.Table('SCluster Similarities', use_id=True,
                   h_category=category,
                   v_category=list(range(len(s_matches))),
                   pairs=s_matches, **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('VCluster Similarities', use_id=True,
                   v_category=list(range(len(v_matches))),
                   h_category=category, pairs=v_matches, **table_attrs),
        form.tr(''),
        form.Table('Cov', use_id=True, h_category=list(labels),
                   pairs=print_cov, **table_attrs),
        form.tr(''),
    ]

    for row_id, row in enumerate(cov):
        # Carry each label's own fraction along with it. Looking it up as
        # fractions[label] is off by one here because of the -1 column
        # (label -1 would even wrap around to the last cluster).
        ranked = sorted(zip(row, labels, fractions),
                        key=lambda item: -item[0])
        pairs = [format_float % value for value, _label, _frac in ranked]
        cate = [str((label, format_float % fraction))
                for _value, label, fraction in ranked]
        components.append(form.Table('cov%d' % (row_id),
                                     h_category=cate, v_category=[row_id],
                                     pairs=[pairs], use_id=True,
                                     **table_attrs))
        components.append(form.tr(''))

    variables = 'var link = "/van_graph?prefix1=%s&prefix2=%s&k=%s&l=%s";\n' % (prefix, prefix, k, 0)
    js = form.js('script', variables + 'function create_graph(id) { window.open(link + "&cell=" + id + "@scluster@vcluster");}\n')
    # Extended call syntax replaces the deprecated apply() built-in.
    return form.Form(*components, js=js)()
def getDegreeComparison(prefix1, prefix2, k, l):
    """Build the form comparing a filtered reference clustering with G<l>.

    ``runcluster.filterWithI(prefix1, k, l, 'out')`` supplies the reference
    side; this function reads its ``'new_clusters'``, ``'total'``,
    ``'in-stats'``, ``'out-stats'`` and ``'g0-similarities'`` entries. The
    other side comes from two files under ``data/<prefix2>/``:

    * ``data.out.clustering.<k>`` -- one cluster label per document;
    * ``data.out.clustering.<k>.output`` -- a statistics report from which
      the six-word header row and the six-column numeric rows are extracted.

    ``Cov[i][j]`` is the share of reference cluster ``i`` that carries label
    ``j`` in the second clustering, minus ``total[j] / sum(total)``.

    Args:
        prefix1: data-set name passed to ``filterWithI``.
        prefix2: data-set name of the clustering being compared.
        k: number of clusters (int or numeric string).
        l: passed through to ``filterWithI`` and into the table title and
            the generated link.

    Returns:
        The result of calling the object built by ``form.Form``.

    Raises:
        IOError: if one of the input files cannot be opened.
        TypeError: if the report contains no header row (``category`` stays
            ``None``).
    """
    from runcluster import filterWithI

    format_float = '%.5f'
    # '%+.5f' lets the formatter emit the sign; gluing '+'/'-' in front of
    # '%.5f' output yields '--0.12345' for negatives and '-0.00000' for 0.
    signed_float = '%+.5f'
    num_clusters = int(k)
    table_attrs = {'border': "1", 'cellspacing': "0", 'cellpadding': "3"}

    def ratio(numerator, denominator):
        # Empty rows / empty input would otherwise raise ZeroDivisionError.
        if not denominator:
            return 0.0
        return numerator / float(denominator)

    results = filterWithI(prefix1, k, l, 'out')
    total = results['total']
    total_nodes = sum(total)
    fractions = [ratio(size, total_nodes) for size in total]

    with open('data/%s/data.out.clustering.%s' % (prefix2, k), 'r') as g:
        clusters2 = g.read().split()

    counts = [[0] * num_clusters for _ in range(num_clusters)]
    for ref_label, label in zip(results['new_clusters'], clusters2):
        counts[int(ref_label)][int(label)] += 1
    cov = []
    for row in counts:
        row_sum = sum(row)
        cov.append([ratio(count, row_sum) - fractions[col]
                    for col, count in enumerate(row)])

    # Raw strings keep the backslashes out of Python's own escape handling.
    WORDS = r'[a-zA-Z]+'
    SPACES = r'[\s]+'
    NUMBERS = r'[\d]+'
    # Explicit sign / dot / digit class; '[+-\.\d]' was an accidental
    # character range from '+' to '.' that also admitted ','.
    SIM = r'[-+.\d]+'
    # Each pattern is "field, whitespace" six times. The unescaped '|' that
    # used to follow acted as regex alternation, so whitespace-only lines
    # matched the second branch and produced rows made of None.
    category_regex = r'[\s]*' + ''.join(
        '(%s)' % (WORDS) + SPACES for _ in range(6))
    stats_regex = SPACES + ''.join(
        '(%s)' % (field) + SPACES
        for field in (NUMBERS, NUMBERS, SIM, SIM, SIM, SIM))

    matches = []
    category = None
    with open('data/%s/data.out.clustering.%s.output'
              % (prefix2, k), 'r') as f:
        for line in f:
            found = match(stats_regex, line)
            if found:
                matches.append(list(found.groups()))
                continue
            found = match(category_regex, line)
            if found:
                category = list(found.groups())

    # One row per reference cluster: [size, sim[0], sim[1]]. Only the first
    # two similarity columns are used, and the dict returned by
    # filterWithI is left untouched.
    g0_rows = [[size] for size in total]
    for row, sims in zip(g0_rows, results['g0-similarities']):
        row.extend([sims[0], sims[1]])
    # Largest cluster first; ties keep their original order.
    ranked_rows = sorted(enumerate(g0_rows), key=lambda item: -item[1][0])
    g0_v_category = [index for index, _row in ranked_rows]
    g0_similarities = [row for _index, row in ranked_rows]
    for row in g0_similarities:
        for col in range(2, len(row)):
            row[col] = signed_float % row[col]

    # Drop header columns 3 and 5 of the original six (the second pop sees
    # the list already shortened by the first).
    g0_category = list(category)
    g0_category.pop(3)
    g0_category.pop(4)

    # Every column is formatted, matching the sibling comparison forms.
    print_cov = [[format_float % value for value in row] for row in cov]

    components = [
        form.Table('Ref stats', use_id=True,
                   h_category=list(range(num_clusters)),
                   v_category=['in-stats', 'out-stats', 'total-nodes'],
                   pairs=[results['in-stats'], results['out-stats'],
                          results['total']],
                   **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('G0 Similarities', use_id=True,
                   h_category=g0_category, v_category=g0_v_category,
                   pairs=g0_similarities, **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('G%s Similarities' % (l), use_id=True,
                   v_category=list(range(len(matches))),
                   h_category=category, pairs=matches, **table_attrs),
        form.tr(''),
        form.Table('Cov', use_id=True,
                   h_category=list(range(num_clusters)),
                   pairs=print_cov, **table_attrs),
        form.tr(''),
    ]

    for row_id, row in enumerate(cov):
        ranked = sorted(zip(row, range(num_clusters)),
                        key=lambda pair: -pair[0])
        pairs = [format_float % value for value, _label in ranked]
        cate = [str((label, format_float % fractions[label]))
                for _value, label in ranked]
        components.append(form.Table('cov%d' % (row_id),
                                     h_category=cate, v_category=[row_id],
                                     pairs=[pairs], use_id=True,
                                     **table_attrs))
        components.append(form.tr(''))

    variables = 'var link = "/van_graph?prefix1=%s&prefix2=%s&k=%s&l=%s";\n' % (prefix1, prefix2, k, l)
    js = form.js('script', variables + 'function create_graph(id) { window.open(link + "&cell=" + id + "@scluster@scluster");}\n')
    # Extended call syntax replaces the deprecated apply() built-in.
    return form.Form(*components, js=js)()
def getMergeComparison(prefix, k=None):
    """Build the form comparing scluster with the abstract-matrix vcluster.

    ``runcluster.filterWithI(prefix, k, 0, 'abstract.matrix')`` supplies
    ``'total'`` and ``'g0-similarities'``. Four files under
    ``data/<prefix>/`` are read:

    * ``data.out.clustering.<k>.output`` and
      ``data.abstract.matrix.clustering.<k>.output`` -- statistics reports,
      split on newlines and parsed by ``getMatches``;
    * ``data.out.clustering.<k>`` and
      ``data.abstract.matrix.clustering.<k>`` -- one cluster label per
      document, paired positionally.

    ``Cov[i][j]`` is the share of scluster ``i`` that carries label ``j`` in
    the second clustering, minus ``total[j] / sum(total)``.

    Args:
        prefix: data-set name interpolated into the paths above.
        k: number of clusters (int or numeric string). The body always
            needed this value but the signature never accepted it; when it
            is omitted the module-level ``k`` is used, which is the only
            way the function could have worked before.

    Returns:
        The result of calling the object built by ``form.Form``.

    Raises:
        NameError: if ``k`` is neither passed nor defined at module level.
        IOError: if one of the input files cannot be opened.
    """
    if k is None:
        # Resolve the fallback before doing any work so a missing value is
        # reported immediately and with a useful message.
        k = globals().get('k')
        if k is None:
            raise NameError(
                "getMergeComparison() needs k: pass it explicitly or "
                "define a module-level 'k'")

    from runcluster import filterWithI

    format_float = '%.5f'
    # '%+.5f' lets the formatter emit the sign; gluing '+'/'-' in front of
    # '%.5f' output yields '--0.12345' for negatives and '-0.00000' for 0.
    signed_float = '%+.5f'
    num_clusters = int(k)
    table_attrs = {'border': "1", 'cellspacing': "0", 'cellpadding': "3"}

    def ratio(numerator, denominator):
        # Empty rows / empty input would otherwise raise ZeroDivisionError.
        if not denominator:
            return 0.0
        return numerator / float(denominator)

    def read_text(path):
        # ``with`` guarantees the handle is closed even if read() fails.
        with open(path, 'r') as handle:
            return handle.read()

    results = filterWithI(prefix, k, 0, 'abstract.matrix')

    s_stats = read_text(
        'data/%s/data.out.clustering.%s.output' % (prefix, k)).split('\n')
    v_stats = read_text(
        'data/%s/data.abstract.matrix.clustering.%s.output'
        % (prefix, k)).split('\n')
    s_matches, category = getMatches(s_stats)
    v_matches, _ = getMatches(v_stats)

    clusters1 = read_text(
        'data/%s/data.out.clustering.%s' % (prefix, k)).split()
    clusters2 = read_text(
        'data/%s/data.abstract.matrix.clustering.%s' % (prefix, k)).split()

    # Cluster sizes on each side, shown as-is in their own tables.
    total1 = [0] * num_clusters
    total2 = [0] * num_clusters
    for label in clusters1:
        total1[int(label)] += 1
    for label in clusters2:
        total2[int(label)] += 1

    total = results['total']
    total_nodes = sum(total)
    fractions = [ratio(size, total_nodes) for size in total]

    counts = [[0] * num_clusters for _ in range(num_clusters)]
    for s_label, v_label in zip(clusters1, clusters2):
        counts[int(s_label)][int(v_label)] += 1
    cov = []
    for row in counts:
        row_sum = sum(row)
        cov.append([ratio(count, row_sum) - fractions[col]
                    for col, count in enumerate(row)])

    # One row per cluster: its size followed by its g0 similarity values.
    g0_rows = [[size] for size in total]
    for row, sims in zip(g0_rows, results['g0-similarities']):
        row.extend(sims)
    # Largest cluster first; ties keep their original order.
    ranked_rows = sorted(enumerate(g0_rows), key=lambda item: -item[1][0])
    g0_v_category = [index for index, _row in ranked_rows]
    g0_similarities = [row for _index, row in ranked_rows]
    for row in g0_similarities:
        for col in range(2, len(row)):
            row[col] = signed_float % row[col]

    # Drop header columns 3, 5 and 0 of the original list (each pop sees
    # the list already shortened by the previous one); the removed first
    # column is supplied through cross='cid' instead.
    g0_category = list(category)
    g0_category.pop(3)
    g0_category.pop(4)
    g0_category.pop(0)

    print_cov = [[format_float % value for value in row] for row in cov]

    components = [
        form.Table('SCluster Similarities', use_id=True,
                   h_category=category,
                   v_category=list(range(len(s_matches))),
                   pairs=s_matches, **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('VCluster Similarities', use_id=True,
                   v_category=list(range(len(v_matches))),
                   h_category=category, pairs=v_matches, **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('G0 Similarities', use_id=True,
                   h_category=g0_category, v_category=g0_v_category,
                   cross='cid', pairs=g0_similarities, **table_attrs),
        form.tr(''),
        form.tr(''),
        form.Table('Total', use_id=True,
                   h_category=list(range(0, len(total))),
                   pairs=[total], **table_attrs),
        form.tr(''),
        form.Table('Total1', use_id=True,
                   h_category=list(range(0, len(total1))),
                   pairs=[total1], **table_attrs),
        form.tr(''),
        form.Table('Total2', use_id=True,
                   h_category=list(range(0, len(total2))),
                   pairs=[total2], **table_attrs),
        form.tr(''),
        form.Table('fractions', use_id=True,
                   h_category=list(range(0, len(fractions))),
                   pairs=[fractions], **table_attrs),
        form.tr(''),
        form.Table('Cov', use_id=True,
                   h_category=list(range(0, num_clusters)),
                   pairs=print_cov, **table_attrs),
        form.tr(''),
    ]

    for row_id, row in enumerate(cov):
        ranked = sorted(zip(row, range(0, num_clusters)),
                        key=lambda pair: -pair[0])
        pairs = [format_float % value for value, _label in ranked]
        cate = [str((label, format_float % fractions[label]))
                for _value, label in ranked]
        components.append(form.Table('cov%d' % (row_id),
                                     h_category=cate, v_category=[row_id],
                                     pairs=[pairs], use_id=True,
                                     **table_attrs))
        components.append(form.tr(''))

    variables = 'var link = "/van_graph?prefix1=%s&prefix2=%s&k=%s&l=0";\n' % (prefix, prefix, k)
    js = form.js('script', variables + 'function create_graph(id) { window.open(link + "&cell=" + id + "@scluster@vcluster_tfidf");}\n')
    # Extended call syntax replaces the deprecated apply() built-in.
    return form.Form(*components, js=js)()