class SimTopicLists:
    """
    Compare similarities between topics in two topic lists and
    write the resulting distance tables as text, HTML or flat lists
    """

    # Column width (in characters) of the plain-text tables
    _TEXT_WIDTH = 14
    # First thing written by every table writer
    _TABLE_CAPTION = (
        "\ntopic List 1 is vertical and topic List 2 is horizontal\n")

    def __init__(self):
        self.sim = Similarity()

    # Pairwise comparison methods

    @staticmethod
    def _pairwise(measure, t_list1, t_list2):
        """
        Apply a two-argument measure to every pair of topics
        :param measure: callable taking (topic of t_list1, topic of t_list2)
        :param t_list1: topics that index the rows of the result
        :param t_list2: topics that index the columns of the result
        :return: a 2D list, entry [i][j] is measure(t_list1[i], t_list2[j])
        """
        return [[measure(topic1, topic2) for topic2 in t_list2]
                for topic1 in t_list1]

    def bha_distance(self, t_list1, t_list2):
        """
        Compare the Bhattacharyya Distance between each of two topics
        in two topic lists and store the results in a 2D list

        Example
        t_list1 contains t1_0, t1_1, t1_2
        t_list2 contains t2_0, t2_1, t2_2
        distance list:
        [[sim(t1_0,t2_0), sim(t1_0, t2_1), sim(t1_0, t2_2)],
         [sim(t1_1,t2_0), sim(t1_1, t2_1), sim(t1_1, t2_2)],
         [sim(t1_2,t2_0), sim(t1_2, t2_1), sim(t1_2, t2_2)]]

        The following distance methods have similar outputs
        :return: a 2D list stores the results
        """
        return self._pairwise(self.sim.bha_distance, t_list1, t_list2)

    def kl_divergence(self, t_list1, t_list2):
        """
        Compare the KL Divergence between each of two topics
        in two topic lists and store the results in a 2D list
        :return: a 2D list stores the results
        """
        return self._pairwise(self.sim.kl_divergence, t_list1, t_list2)

    def cos_distance(self, t_list1, t_list2):
        """
        Compare the cosine distance between each of two topics
        in two topic lists and store the results in a 2D list
        :return: a 2D list stores the results
        """
        return self._pairwise(self.sim.cosine_distance, t_list1, t_list2)

    def kendall(self, t_list1, t_list2):
        """
        Compare the kendall tau correlation between each of two topics
        in two topic lists and store the results in a 2D list
        :return: a 2D list stores the results
        """
        # The topic from t_list2 is passed first; this argument order is
        # kept exactly as it has always been called.
        return self._pairwise(
            lambda topic1, topic2: self.sim.kendall_tau(topic2, topic1),
            t_list1, t_list2)

    def dcg(self, t_list1, t_list2, word_limit=0):
        """
        Compare the difference between dcg values of each two topics
        in two topic lists and store the results in a 2D list
        :param word_limit: limit handed to dcg_difference; 0 means
                           "use len(t_list1)"
        :return: a 2D list stores the results
        """
        if word_limit == 0:
            # NOTE(review): the fallback is the number of topics in t_list1,
            # not a number of words -- confirm this is intended.
            word_limit = len(t_list1)
        return self._pairwise(
            lambda topic1, topic2: self.sim.dcg_difference(
                topic1, topic2, word_limit),
            t_list1, t_list2)

    def jaccard(self, t_list1, t_list2, threshold):
        """
        Compare the jaccard distance between each of two topics
        in two topic lists and store the results in a 2D list
        :param threshold: passed through to jaccard_distance
        :return: a 2D list stores the results
        """
        return self._pairwise(
            lambda topic1, topic2: self.sim.jaccard_distance(
                topic1, topic2, threshold),
            t_list1, t_list2)

    # Flat distance lists

    def write_distance(self, distance_list, ofile):
        """
        Write the distance values between each pair of distinct topics
        of one topic list, one value per line

        Assume one topic list t_list1 contains t0, t1, t2
        :param distance_list: generated by one of similarity/distance
                              methods above i.e.
        [[sim(t0,t0), sim(t0,t1), sim(t0,t2)],
         [sim(t1,t0), sim(t1, t1), sim(t1,t2)],
         [sim(t2,t0), sim(t2,t1), sim(t2,t2)]]
        :param ofile: the output file; receives the entries right of the
                      diagonal, i.e. sim(t0,t1), sim(t0,t2), sim(t1,t2)
        :return: None
        """
        for row_index, row in enumerate(distance_list):
            for value in row[row_index + 1:]:
                ofile.write(str(value) + "\n")

    def read_distance_list(self, ifile):
        """
        Read a distance list from an output file
        :param ifile: the input file, one number per line
        :return: a list of similarity/distance values
        """
        return [float(line) for line in ifile]

    def give_dist_names(self, dist_list, topics_count, corpus_type):
        """
        Turn a distance list into a list of
        (topic_pair_name, similarity value) tuples
        This new list will be used in the calculation of kendalltau
        correlations among different similarity measures.

        :param dist_list: a distance list generated by a tfidf 3-topic LDA
        i.e. Suppose a topic list t_list1 contains t0, t1, t2 and
        [sim(t0,t1), sim(t0,t2), sim(t1,t2)] corresponds to [0.3, 0.2, 0.4]
        :param topics_count: the number of topics in the topic list that
                             related with the distance list
        :param corpus_type: the corpus type of the LDA that generates
                            this topic list
        :return: [("tfidf_t3t0_t1", 0.3), ("tfidf_t3t0_t2", 0.2),
                  ("tfidf_t3t1_t2", 0.4)]
        """
        prefix = corpus_type + "_t" + str(topics_count)
        dist_names = []
        index = 0
        for t1 in range(topics_count):
            for t2 in range(t1 + 1, topics_count):
                dist_names.append(
                    (prefix + "t" + str(t1) + "_t" + str(t2),
                     dist_list[index]))
                index += 1
        return dist_names

    def read_distance_rank(self, dist_list, topics_count, corpus_type):
        """
        Same as give_dist_names; kept as a separate name for callers
        :return: a list of (topic_pair_name, similarity value) tuples
        """
        return self.give_dist_names(dist_list, topics_count, corpus_type)

    # Helpers that locate extreme values in one row

    def find_smallest(self, num_list):
        """
        Find the positions of the two smallest values
        :param num_list: a list with at least two numbers
        :return: (index of the smallest, index of the second smallest)
        """
        if num_list[0] < num_list[1]:
            row_min, row_min2, i1, i2 = num_list[0], num_list[1], 0, 1
        else:
            row_min, row_min2, i1, i2 = num_list[1], num_list[0], 1, 0
        for index, value in enumerate(num_list):
            if value < row_min:
                # new minimum: the old one becomes the runner-up
                row_min2, i2 = row_min, i1
                row_min, i1 = value, index
            elif (row_min < value < row_min2) or (row_min == value
                                                  and i1 != index):
                # better runner-up, or a tie with the current minimum
                row_min2, i2 = value, index
        return i1, i2

    def find_smallest_self(self, num_list):
        """
        Same as find_smallest; kept as a separate name for callers
        :return: (index of the smallest, index of the second smallest)
        """
        return self.find_smallest(num_list)

    def find_largest_two(self, num_list):
        """
        Find the positions of the two largest values
        :param num_list: a list with at least two numbers
        :return: (index of the largest, index of the second largest)
        """
        if num_list[0] > num_list[1]:
            row_max, row_max2, i1, i2 = num_list[0], num_list[1], 0, 1
        else:
            row_max, row_max2, i1, i2 = num_list[1], num_list[0], 1, 0
        for index, value in enumerate(num_list):
            if value > row_max:
                # new maximum: the old one becomes the runner-up
                row_max2, i2 = row_max, i1
                row_max, i1 = value, index
            elif (row_max > value > row_max2) or (row_max == value
                                                  and i1 != index):
                # better runner-up, or a tie with the current maximum
                row_max2, i2 = value, index
        return i1, i2

    def find_largest_one(self, num_list):
        """
        Find the position of the largest value
        :param num_list: a non-empty list of numbers
        :return: index of the first occurrence of the largest value
        """
        lmax = num_list[0]
        i = 0
        for index, value in enumerate(num_list):
            if value > lmax:
                lmax = value
                i = index
        return i

    def rank(self, nlist):
        """
        Label every number with its position in ascending order
        :param nlist: a list of numbers
        :return: a list of "<rank> <value with 6 decimals>" strings in the
                 order of nlist; equal values all get the rank of their
                 last position in the sorted list
        """
        position = {}
        for index, num in enumerate(sorted(nlist)):
            position[num] = index
        return [str(position[num]) + " " + '{0:.6f}'.format(num)
                for num in nlist]

    # Private helpers shared by the HTML table writers

    def _write_html_header(self, file, column_count):
        """Write the caption, the style block and the column header row"""
        file.write(self._TABLE_CAPTION)
        file.write("<style>table, th, td {border: 1px solid black;}</style>")
        file.write("<table><thead><tr><th></th>")
        for column in range(column_count):
            file.write("<th> topic" + str(column) + "</th>")
        file.write("</tr></thead><tbody>")

    @staticmethod
    def _write_html_row(file, row_index, cells):
        """Write one table row; cells is a list of (css colour, text)"""
        file.write("<tr><td> topic" + str(row_index) + "</td>")
        for colour, text in cells:
            file.write("<td><span style='background-color: " + colour + "'>")
            file.write(text)
            file.write("</span></td>")
        file.write("</tr>")

    @staticmethod
    def _write_html_footer(file):
        """Close the table body and the table"""
        file.write("</tbody></table>")

    @staticmethod
    def _gray(level):
        """Return the css colour with all three channels set to level"""
        level = str(level)
        return "rgb(" + level + "," + level + "," + level + ")"

    def _write_gray_rank_table(self, distance_list, file, darker_when_larger):
        """
        Write an HTML table whose cells are shaded by their rank in the row
        :param darker_when_larger: True shades rank 0 white and higher
                                   ranks darker, False does the opposite
        """
        column_count = len(distance_list[0])
        self._write_html_header(file, column_count)
        shade_step = 255 // column_count
        for row_index, row in enumerate(distance_list):
            cells = []
            for ranked in self.rank(row):
                rank_text, value_text = ranked.split()
                level = int(rank_text) * shade_step
                if darker_when_larger:
                    level = 255 - level
                cells.append((self._gray(level), value_text))
            self._write_html_row(file, row_index, cells)
        self._write_html_footer(file)

    # HTML table writers

    def show_results_rank(self, distance_list, file):
        """
        Write a distance table as HTML, colouring every cell by the rank
        of its value within its row (rank 0 is #FFFFFF)
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        column_count = len(distance_list[0])
        self._write_html_header(file, column_count)
        colour_step = 16777215 // column_count
        for row_index, row in enumerate(distance_list):
            cells = []
            for ranked in self.rank(row):
                rank_text, value_text = ranked.split()
                colour = format(16777215 - int(rank_text) * colour_step,
                                "06X")
                cells.append(("#" + colour, value_text))
            self._write_html_row(file, row_index, cells)
        self._write_html_footer(file)

    def show_results_rank_bw(self, distance_list, file):
        """
        Write a distance table as HTML, shading every cell in gray by the
        rank of its value within its row (smallest value is white)
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        self._write_gray_rank_table(distance_list, file, True)

    def show_results_rank_reverse(self, distance_list, file):
        """
        Write a distance table as HTML, shading every cell in gray by the
        rank of its value within its row (smallest value is black)
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        self._write_gray_rank_table(distance_list, file, False)

    def show_results_value(self, distance_list, file):
        """
        Write a distance table as HTML, shading every cell in gray by its
        value scaled between the table minimum (white) and maximum (black)
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        self._write_html_header(file, len(distance_list[0]))
        max_value = max(max(row) for row in distance_list)
        min_value = min(min(row) for row in distance_list)
        span = max_value - min_value
        for row_index, row in enumerate(distance_list):
            cells = []
            for value in row:
                # a table of identical values has no range to scale by
                percent = (value - min_value) / span if span else 0.0
                level = int(255 * (1 - percent))
                cells.append((self._gray(level), '{0:.6f}'.format(value)))
            self._write_html_row(file, row_index, cells)
        self._write_html_footer(file)

    def show_results_value_reverse(self, distance_list, file):
        """
        Write a distance table as HTML, shading every cell in gray by its
        value scaled between the table minimum (black) and the largest
        value that is not 1 (white); values equal to 1 after rounding to
        6 decimals are always white
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        self._write_html_header(file, len(distance_list[0]))
        min_value = min(min(row) for row in distance_list)
        below_one = [value for row in distance_list for value in row
                     if round(value, 6) != 1.000000]
        # with nothing but 1.0 in the table there is no range to scale by
        max_value = max(below_one) if below_one else min_value
        span = max_value - min_value
        for row_index, row in enumerate(distance_list):
            cells = []
            for value in row:
                if round(value, 6) == 1.000000:
                    level = 255
                else:
                    percent = (value - min_value) / span if span else 0.0
                    level = int(255 * percent)
                cells.append((self._gray(level), '{0:.6f}'.format(value)))
            self._write_html_row(file, row_index, cells)
        self._write_html_footer(file)

    # Private helpers shared by the plain-text table writers

    def _write_text_header(self, file, column_count):
        """Write the caption and the line of column labels"""
        width = self._TEXT_WIDTH
        file.write(self._TABLE_CAPTION)
        file.write("{:{w}}".format("T_List1/T_List2", w=width))
        for column in range(column_count):
            file.write('{:{w}}'.format(" topic" + str(column), w=width))
        file.write("\n")

    def _write_text_row(self, file, row_index, cells, column_count):
        """Write the row label, the padded cells and the separator line"""
        width = self._TEXT_WIDTH
        file.write('{:{w}}'.format(" topic" + str(row_index), w=width))
        for cell in cells:
            file.write('{:{w}}'.format(cell, w=width))
        file.write("\n" + "-" * width * (column_count + 1) + "\n")

    # Plain-text table writers

    def show_results(self, distance_list, file):
        """
        Write a distance table as text and mark the largest value of every
        row with "**", of every column with "++" and of both with "*+"
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        column_count = len(distance_list[0])
        self._write_text_header(file, column_count)
        # the column maxima do not depend on the row being written
        column_max = [
            self.find_largest_one([row[column] for row in distance_list])
            for column in range(column_count)
        ]
        for row_index, row in enumerate(distance_list):
            row_max = self.find_largest_one(row)
            cells = []
            for column, value in enumerate(row):
                is_row_max = column == row_max
                is_column_max = row_index == column_max[column]
                if is_row_max and is_column_max:
                    marker = "| *+"
                elif is_row_max:
                    marker = "| **"
                elif is_column_max:
                    marker = "| ++"
                else:
                    marker = "| "
                cells.append(marker + '{0:.6f}'.format(value))
            self._write_text_row(file, row_index, cells, column_count)

    def show_results_self(self, distance_list, file):
        """
        Write a distance table as text and mark the largest value of
        every row with "**"
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        column_count = len(distance_list[0])
        self._write_text_header(file, column_count)
        for row_index, row in enumerate(distance_list):
            row_max = self.find_largest_one(row)
            cells = [
                ("| **" if column == row_max else "| ")
                + '{0:.6f}'.format(value)
                for column, value in enumerate(row)
            ]
            self._write_text_row(file, row_index, cells, column_count)

    def show_results_2min_self(self, distance_list, file):
        """
        Write a distance table as text and mark the smallest value of
        every row with "##" and the second smallest with "#"
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        column_count = len(distance_list[0])
        self._write_text_header(file, column_count)
        for row_index, row in enumerate(distance_list):
            min_1, min_2 = self.find_smallest(row)
            cells = []
            for column, value in enumerate(row):
                if column == min_1:
                    marker = "|##"
                elif column == min_2:
                    marker = "| #"
                else:
                    marker = "| "
                cells.append(marker + '{0:.6f}'.format(value))
            self._write_text_row(file, row_index, cells, column_count)

    def show_results_2max_self(self, distance_list, file):
        """
        Write a distance table as text and mark the second largest value
        of every row with "**"; the largest value is written like any
        other cell
        :param distance_list: a 2D list of data
        :param file: the output file
        """
        column_count = len(distance_list[0])
        self._write_text_header(file, column_count)
        for row_index, row in enumerate(distance_list):
            # find_largest_two never returns the same index twice, so only
            # the runner-up needs to be told apart
            _, max_2 = self.find_largest_two(row)
            cells = [
                ("| **" if column == max_2 else "| ")
                + '{0:.6f}'.format(value)
                for column, value in enumerate(row)
            ]
            self._write_text_row(file, row_index, cells, column_count)
class BCInterpreter:
    """
    A helper class used in the interpretation of BC Distance
    """

    def __init__(self):
        self.bc = Similarity()

    def bc_similarity(self, seq_size, sample_size=30, sample_times=20,
                      degree=0.1, bc_times=20):
        """
        Calculate the average bc_distance between two similar
        distributions with certain parameters
        :param seq_size: the size of the sequence (population)
        :param sample_size: the size of the sample
        :param sample_times: times of sampling
        :param degree: degree of randomness.
        The smaller the number, the more sparse the distribution is.
        The larger the number, the more uniform the distribution is.
        :param bc_times: times to calculate bc_distance
        :return: an averaged bc_distance value
        """
        bsum = 0
        for _ in range(bc_times):
            # a random population distribution and the mean of several
            # sample distributions drawn from it
            dist1 = list(numpy.random.dirichlet([degree] * seq_size))
            dist2 = self.mean_rand_dist(dist1, seq_size, sample_size,
                                        sample_times)
            bsum += self.bc.bha_distance(self.dist_to_topic(dist1),
                                         self.dist_to_topic(dist2))
        return bsum / bc_times

    def bc_difference(self, seq_size, degree1, degree2, bc_times=20):
        """
        Calculate the average bc_distance between two independently drawn
        Dirichlet distributions
        :param seq_size: the size of the sequence (population)
        :param degree1: degree of randomness of the first distribution
        :param degree2: degree of randomness of the second distribution
        :param bc_times: times to calculate bc_distance
        :return: an averaged bc_distance value
        """
        bsum = 0
        for _ in range(bc_times):
            dist1 = list(numpy.random.dirichlet([degree1] * seq_size))
            dist2 = list(numpy.random.dirichlet([degree2] * seq_size))
            bsum += self.bc.bha_distance(self.dist_to_topic(dist1),
                                         self.dist_to_topic(dist2))
        return bsum / bc_times

    def mean_rand_dist(self, dist, seq_size, sample_size=30, sample_times=50):
        """
        Average the distributions of several random samples drawn from
        a population with a specific distribution
        :param dist: the distribution of the population
        :param seq_size: the range of the population
        :param sample_size: the size of each sample
        :param sample_times: the number of samples to draw
        :return: a list of seq_size values, the element-wise mean of the
                 sample distributions
        """
        samples = [self.rand_dist(dist, seq_size, sample_size)
                   for _ in range(sample_times)]
        return [sum(sample[num] for sample in samples) / sample_times
                for num in range(seq_size)]

    def rand_dist(self, dist, seq_size, sample_size):
        """
        Get a new distribution on a random sample from a population
        with a specific distribution
        :param dist: the distribution of the population
        :param seq_size: the range of the population
        :param sample_size: the size of the sample
        :return: a list of seq_size relative frequencies
        """
        # get the random sample
        sample = numpy.random.choice(seq_size, sample_size, p=dist)
        # count every population value in one pass over the sample
        counts = numpy.bincount(sample, minlength=seq_size)
        return [float(count) / float(sample_size) for count in counts]

    def dist_to_topic(self, dist):
        """
        Output a topic object with the input distribution
        :param dist: a distribution list
        :return: a topic object whose words_dist is a list of
                 (index, value) tuples
        """
        topic = topicio.Topic()
        topic.words_dist = list(enumerate(dist))
        return topic