def FindClosestVectors(p_test_vector, p_vector_list):
    '''
    Return the index of the entry of p_vector_list closest to p_test_vector.

    Closeness is the value returned by jensen_shannon_distance(p_test_vector,
    candidate). When several candidates share the smallest distance, the
    index of the first of them is returned. An empty p_vector_list makes
    min() raise ValueError.
    '''

    distances = [jensen_shannon_distance(p_test_vector, candidate)
                 for candidate in p_vector_list]
    return distances.index(min(distances))
def GetClosestSentenceToAbsOfVector(p_vector, p_pmf_objects):
    '''
    Return the sentence of the object in p_pmf_objects closest to |p_vector|.

    p_vector is overwritten IN PLACE with the absolute value of each of its
    components, so the caller's sequence is modified. Every object is turned
    into a vector by reading obj.pos()["pos"][tag] for each tag in
    spacy_pos_order; the object whose vector has the smallest
    jensen_shannon_distance to the rectified p_vector supplies the result via
    its sentence() method. Ties go to the earliest object; an empty
    p_pmf_objects makes min() raise ValueError.
    '''

    # Rectify the caller's vector component by component (in place)
    for position, component in enumerate(p_vector):
        p_vector[position] = abs(component)

    # One distance per candidate object, in the order the objects were given
    distances = [
        jensen_shannon_distance(
            p_vector,
            [obj.pos()["pos"][tag] for tag in spacy_pos_order])
        for obj in p_pmf_objects
    ]

    closest_index = distances.index(min(distances))
    return p_pmf_objects[closest_index].sentence()
def Build_CorpusMapJSON(corpus_name, corpus_topics, file_topic_proportions, output_dir):
    '''
    Build the TWiC corpus map and write it to output_dir + 'twic_corpusmap.json'.

    Parameters:
        corpus_name            - stored as the "name" of the corpus node
        corpus_topics          - dict of topic id -> corpus-level proportion; it is
                                 also read as corpus_topics[str(i)] for
                                 i in 0..len-1, so keys must be those strings
        file_topic_proportions - sequence of per-file objects; this function reads
                                 their id, fileid, topic_guide and
                                 sorted_topic_list attributes
        output_dir             - prefix that is concatenated directly with the file
                                 name (no separator is inserted, so it has to
                                 end with one)

    Returns nothing; the map is serialized with json.dumps and written to disk.

    TWiC JSON Hierarchy: Corpus -> Clusters -> Texts

    // Corpus
    {
        "name": <corpus_name>,
        "ideal_text": <file_id>,      - Text closest to the corpus topic distribution
        "distance2ideal": <distance>, - Distance of that text from the corpus distribution
        "topics": { <topic_id> : [<rank>, <topic_proportion>], ... },

        // Clusters
        "children": [
            {
                "name": <cluster_name>,       - Topic number
                "ideal_text": <file_id>,      - "primary_doc" of the cluster
                "distance2ideal": <distance>, - "distance2cdist" of the cluster
                "topics": { <topic_id> : [<rank>, <topic_proportion>], ... },

                // Texts
                "children": [
                    {
                        "name": <text_name>,
                        "ideal_text": <file_id>,      - Self
                        "distance2ideal": <distance>, - Third field of the linked doc entry
                        "topics": <topic_guide of the text>,
                        "children": []
                    }, ...
                ]
            }, ...
        ]
    }

    Ranks are 1-based, ordered by descending proportion. With an empty
    file_topic_proportions the corpus "ideal_text" and "distance2ideal" keep
    their "" placeholders. Clusters whose primary doc, and texts whose file id,
    cannot be found in file_topic_proportions are reported and skipped.
    '''

    def rank_topics(topic_pairs):
        # Map topic id -> [1-based rank, proportion], largest proportion first
        ordered = sorted(topic_pairs, key=lambda pair: pair[1], reverse=True)
        return {pair[0]: [rank, pair[1]] for rank, pair in enumerate(ordered, 1)}

    # 1. Define corpus level JSON
    twic_corpus_map = {
        "name" : corpus_name,
        "ideal_text" : "",
        "distance2ideal" : "",
        "topics" : rank_topics(corpus_topics.items()),
        "children" : []
    }

    # Get corpus topic distribution sorted by topic id
    corpus_topic_proportions = [corpus_topics[str(index)]
                                for index in range(len(corpus_topics))]

    # Find the document whose distribution is closest to the corpus topic distribution
    distances_to_ideal = []
    for doc in file_topic_proportions:
        # Lay the (topic id, proportion) pairs out positionally by topic id
        doc_distribution = [0.0] * len(doc.sorted_topic_list)
        for topic_pair in doc.sorted_topic_list:
            doc_distribution[int(topic_pair[0])] = topic_pair[1]
        distances_to_ideal.append(
            [doc.id,
             utils_jensen_shannon.jensen_shannon_distance(corpus_topic_proportions,
                                                          doc_distribution)])

    # Save the closest text and its distance (nothing to save for an empty corpus)
    if distances_to_ideal:
        closest = min(distances_to_ideal, key=lambda pair: pair[1])
        twic_corpus_map["ideal_text"] = closest[0]
        twic_corpus_map["distance2ideal"] = closest[1]

    # 2. Now work on defining the cluster level JSON
    clusters_json = TWiC_MalletInterpret.DetermineCorpusClusters(file_topic_proportions,
                                                                 corpus_topic_proportions)
    for cluster_topic_id in clusters_json.keys():

        cluster = clusters_json[cluster_topic_id]

        # Find the ideal file of this cluster in the file topic proportion collection
        primary_doc = next((ftp for ftp in file_topic_proportions
                            if ftp.id == cluster["primary_doc"]), None)
        if primary_doc is None:
            print('Could not find primary doc {0} for cluster {1} in ftp collection. Skipping cluster.'.format(cluster["primary_doc"], cluster_topic_id))
            continue

        # The cluster's topics are the ranked topics of its ideal text
        cluster_entry = {
            "name" : cluster_topic_id,
            "ideal_text" : cluster["primary_doc"],
            "distance2ideal" : cluster["distance2cdist"],
            "topics" : rank_topics(primary_doc.topic_guide.items()),
            "children" : []
        }
        twic_corpus_map["children"].append(cluster_entry)

        # 3. Work on the text level JSON
        for entry in cluster["linked_docs"]:

            # Look every text up on its own; a miss must never fall back to the
            # previously matched text (or to the last file in the collection)
            linked_text = next((ftp for ftp in file_topic_proportions
                                if ftp.fileid == entry[1]), None)
            if linked_text is None:
                print('Could not find linked doc {0} for cluster {1} in ftp collection. Skipping text.'.format(entry[1], cluster_topic_id))
                continue

            cluster_entry["children"].append({
                "name" : entry[1],
                "ideal_text" : entry[0],
                "distance2ideal" : entry[2],
                "topics" : linked_text.topic_guide,
                "children" : []
            })

    # 4. Write out corpus map to JSON
    with open(output_dir + 'twic_corpusmap.json', 'w') as output_file:
        output_file.write(json.dumps(twic_corpus_map))
def DetermineCorpusClusters_Avg(file_topic_proportions, corpus_topic_proportions):
    '''
    Cluster texts by their top topic and describe each cluster by its average
    topic distribution.

    NOTE: a second definition with this same name appears later in this file;
    being later, that one replaces this one when the file is loaded.

    Parameters:
        file_topic_proportions   - sequence of per-file objects; this function reads
                                   their fileid, topic_guide (indexed by
                                   str(topic index)) and sorted_topic_list
                                   (pairs of topic id, proportion; the first
                                   pair is treated as the top topic)
        corpus_topic_proportions - corpus-level proportions, one per topic; its
                                   length defines the number of topics/clusters

    Returns a dict keyed by integer topic id:
        {
            <topic_id> : {
                "name" : <topic_id>,
                "children" : [
                    { "name" : <fileid>,
                      "topics" : { <topic_id> : [<rank>, <proportion>], ... },
                      "dist2avg" : <distance from text to cluster average> }, ...
                ],
                "dist2avg" : <distance from cluster average to corpus distribution>,
                "topics" : { <topic_id> : [<rank>, <average proportion>], ... }
            }, ...
        }

    Cluster-level ranks are 1-based. Text-level ranks are the 0-based position
    of the topic in the text's sorted_topic_list.
    NOTE(review): the two rank bases differ - confirm which one consumers expect.

    The average is taken over the texts of the cluster; a cluster without
    texts keeps an all-zero average.
    '''

    clusters_json = {}
    topic_count = len(corpus_topic_proportions)

    for topic_id in range(topic_count):

        # Clusters have name, dist2avg, topics, and text-level children
        cluster = { "name": topic_id, "children": [] }
        clusters_json[topic_id] = cluster

        # Find all texts with this topic as their top topic
        cluster_texts = [ftp for ftp in file_topic_proportions
                         if int(ftp.sorted_topic_list[0][0]) == topic_id]

        # Sum the topic proportions of the cluster's texts ...
        cluster_avg_topic_dist = [0.0] * topic_count
        for ftp in cluster_texts:
            for index in range(topic_count):
                cluster_avg_topic_dist[index] += ftp.topic_guide[str(index)]

        # ... and average over the number of texts (not the number of topics)
        if cluster_texts:
            text_count = float(len(cluster_texts))
            cluster_avg_topic_dist = [total / text_count
                                      for total in cluster_avg_topic_dist]

        # Get its distance from the corpus distribution
        cluster["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(
            corpus_topic_proportions, cluster_avg_topic_dist)

        # Rank the topics of the cluster average (1-based, largest first;
        # equal proportions keep ascending topic order)
        ranked_topics = sorted(range(topic_count),
                               key=lambda index: cluster_avg_topic_dist[index],
                               reverse=True)
        rank_of = {index: rank for rank, index in enumerate(ranked_topics, 1)}
        cluster["topics"] = {index: [rank_of[index], cluster_avg_topic_dist[index]]
                             for index in range(topic_count)}

        # Now add the text-level children
        for ftp in cluster_texts:

            text_json = { "name": ftp.fileid }

            # Get the ranked topics/topic proportions for this text
            text_topics = {topic: [] for topic in range(topic_count)}
            for position, topic_pair in enumerate(ftp.sorted_topic_list):
                text_topics[int(topic_pair[0])].extend([position, topic_pair[1]])
            text_json["topics"] = text_topics

            # Distance between the cluster's average distribution and this text's
            text_topic_distribution = [ftp.topic_guide[str(index)]
                                       for index in range(topic_count)]
            text_json["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(
                cluster_avg_topic_dist, text_topic_distribution)

            # Add this text to the cluster json
            cluster["children"].append(text_json)

    return clusters_json
def DetermineCorpusClusters(file_topic_proportions, corpus_topic_proportions):
    '''
    Cluster files around the file in which each topic is strongest.

    Clustering task:
        1. For every topic, find the file holding the largest proportion of it.
           That file's full topic distribution becomes the cluster's standard.
        2. Measure the Jensen-Shannon distance from every file to every
           standard (the standard's own file gets distance 0).
        3. Assign every file to the topic whose standard it is closest to;
           on equal distances the topic encountered first wins.

    Parameters:
        file_topic_proportions   - sequence of per-file objects; this function reads
                                   their id (MALLET-assigned), fileid and
                                   topic_guide (topic id -> proportion, also
                                   indexed by str(topic index))
        corpus_topic_proportions - corpus-level topic distribution that each
                                   standard is compared against

    Returns a dict keyed by integer topic id:
        {
            <topic_id> : {
                "primary_topic" : <topic_id>,
                "primary_doc" : <id of the file where the topic is strongest>,
                "distance2cdist" : <distance of that file to the corpus distribution>,
                "linked_docs" : [ [<int(id)>, <fileid>, <distance to the standard>], ... ]
            }, ...
        }

    Files are resolved through their id attribute, so the result does not
    depend on file_topic_proportions being ordered by MALLET id.
    '''

    # 1. Per topic, remember [file id, proportion] of the file where it is strongest
    top_proportions = {}
    for doc in file_topic_proportions:
        for topic_id in doc.topic_guide:
            int_topic_id = int(topic_id)
            topic_proportion = doc.topic_guide[topic_id]
            # Strict comparison: on equal proportions the earlier file is kept
            if (int_topic_id not in top_proportions
                    or top_proportions[int_topic_id][1] < topic_proportion):
                top_proportions[int_topic_id] = [doc.id, topic_proportion]

    # 2. Full distribution of every file (and the file itself), keyed by file id
    topic_count = len(top_proportions)
    prob_distributions = {}
    docs_by_id = {}
    for doc in file_topic_proportions:
        prob_distributions[doc.id] = [doc.topic_guide[str(index)]
                                      for index in range(topic_count)]
        docs_by_id[doc.id] = doc

    # Distances of every file to each topic's standard distribution
    jsd_buckets = {}
    for topic_id in top_proportions:
        top_doc_id = top_proportions[topic_id][0]
        top_file_probdistr = prob_distributions[top_doc_id]
        bucket = {}
        for doc in file_topic_proportions:
            if doc.id == top_doc_id:
                bucket[doc.id] = 0
            else:
                bucket[doc.id] = utils_jensen_shannon.jensen_shannon_distance(
                    top_file_probdistr, prob_distributions[doc.id])
        jsd_buckets[topic_id] = bucket

    # 3. Assign every file to the topic whose standard it is closest to
    smallest_distances = {}
    for doc in file_topic_proportions:
        distances = [[topic_id, jsd_buckets[topic_id][doc.id]]
                     for topic_id in jsd_buckets]
        smallest_distances[doc.id] = min(distances, key=lambda pair: pair[1])[0]

    # File ids are collected in cluster buckets, keyed by topic id
    topic_clusters = {}
    for topic_id in jsd_buckets:
        topic_clusters[topic_id] = [doc.id for doc in file_topic_proportions
                                    if smallest_distances[doc.id] == topic_id]

    # Distance of every standard file to the corpus topic distribution
    distance2cdist_map = {}
    for topic_id in top_proportions:
        file_id = top_proportions[topic_id][0]
        distance2cdist_map[file_id] = utils_jensen_shannon.jensen_shannon_distance(
            corpus_topic_proportions, prob_distributions[file_id])

    # Assemble the per-cluster description
    clusters_json = {}
    for topic_id in topic_clusters:
        primary_doc_id = top_proportions[topic_id][0]
        linked_docs = [[int(mallet_file_id),
                        docs_by_id[mallet_file_id].fileid,
                        jsd_buckets[topic_id][mallet_file_id]]
                       for mallet_file_id in topic_clusters[topic_id]]
        clusters_json[topic_id] = {
            "primary_topic" : topic_id,
            "primary_doc" : primary_doc_id,
            "distance2cdist" : distance2cdist_map[primary_doc_id],
            "linked_docs" : linked_docs
        }

    return clusters_json
def DetermineCorpusClusters_Avg(file_topic_proportions, corpus_topic_proportions):
    '''
    Cluster texts by their top topic and describe each cluster by its average
    topic distribution.

    NOTE: an earlier definition with this same name exists in this file; this
    later one is the definition that is in effect once the file is loaded.

    Parameters:
        file_topic_proportions   - sequence of per-file objects; this function reads
                                   their fileid, topic_guide (indexed by
                                   str(topic index)) and sorted_topic_list
                                   (pairs of topic id, proportion; the first
                                   pair is treated as the top topic)
        corpus_topic_proportions - corpus-level proportions, one per topic; its
                                   length defines the number of topics/clusters

    Returns a dict keyed by integer topic id:
        {
            <topic_id> : {
                "name" : <topic_id>,
                "children" : [
                    { "name" : <fileid>,
                      "topics" : { <topic_id> : [<rank>, <proportion>], ... },
                      "dist2avg" : <distance from text to cluster average> }, ...
                ],
                "dist2avg" : <distance from cluster average to corpus distribution>,
                "topics" : { <topic_id> : [<rank>, <average proportion>], ... }
            }, ...
        }

    Cluster-level ranks are 1-based. Text-level ranks are the 0-based position
    of the topic in the text's sorted_topic_list.
    NOTE(review): the two rank bases differ - confirm which one consumers expect.

    The average is taken over the texts of the cluster; a cluster without
    texts keeps an all-zero average.
    '''

    clusters_json = {}
    topic_count = len(corpus_topic_proportions)

    for topic_id in range(topic_count):

        # Clusters have name, dist2avg, topics, and text-level children
        cluster = { "name": topic_id, "children": [] }
        clusters_json[topic_id] = cluster

        # Find all texts with this topic as their top topic
        cluster_texts = [ftp for ftp in file_topic_proportions
                         if int(ftp.sorted_topic_list[0][0]) == topic_id]

        # Sum the topic proportions of the cluster's texts ...
        cluster_avg_topic_dist = [0.0] * topic_count
        for ftp in cluster_texts:
            for index in range(topic_count):
                cluster_avg_topic_dist[index] += ftp.topic_guide[str(index)]

        # ... and average over the number of texts (not the number of topics)
        if cluster_texts:
            text_count = float(len(cluster_texts))
            cluster_avg_topic_dist = [total / text_count
                                      for total in cluster_avg_topic_dist]

        # Get its distance from the corpus distribution
        cluster["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(
            corpus_topic_proportions, cluster_avg_topic_dist)

        # Rank the topics of the cluster average (1-based, largest first;
        # equal proportions keep ascending topic order)
        ranked_topics = sorted(range(topic_count),
                               key=lambda index: cluster_avg_topic_dist[index],
                               reverse=True)
        rank_of = {index: rank for rank, index in enumerate(ranked_topics, 1)}
        cluster["topics"] = {index: [rank_of[index], cluster_avg_topic_dist[index]]
                             for index in range(topic_count)}

        # Now add the text-level children
        for ftp in cluster_texts:

            text_json = { "name": ftp.fileid }

            # Get the ranked topics/topic proportions for this text
            text_topics = {topic: [] for topic in range(topic_count)}
            for position, topic_pair in enumerate(ftp.sorted_topic_list):
                text_topics[int(topic_pair[0])].extend([position, topic_pair[1]])
            text_json["topics"] = text_topics

            # Distance between the cluster's average distribution and this text's
            text_topic_distribution = [ftp.topic_guide[str(index)]
                                       for index in range(topic_count)]
            text_json["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(
                cluster_avg_topic_dist, text_topic_distribution)

            # Add this text to the cluster json
            cluster["children"].append(text_json)

    return clusters_json