Example #1
def FindClosestVectors(p_test_vector, p_vector_list):

    # Return the index of the vector in p_vector_list that is closest to
    # p_test_vector under the Jensen-Shannon distance
    distance_list = []
    for vector in p_vector_list:
        distance_list.append(jensen_shannon_distance(p_test_vector, vector))
    return distance_list.index(min(distance_list))
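All of these examples rely on a jensen_shannon_distance helper defined elsewhere in the project. As a point of reference, here is a minimal sketch of what such a helper typically computes: the square root of the Jensen-Shannon divergence with base-2 logarithms, so values fall in [0, 1]. The project's actual implementation may differ; scipy.spatial.distance.jensenshannon is an equivalent off-the-shelf alternative.

import math

def jensen_shannon_distance(p, q):
    # Square root of the Jensen-Shannon divergence (base-2 logs), so the
    # result lies in [0, 1]; p and q are same-length probability vectors
    m = [(pi + qi) / 2.0 for pi, qi in zip(p, q)]
    def kl(a, b):
        return sum(ai * math.log(ai / bi, 2) for ai, bi in zip(a, b) if ai > 0)
    divergence = 0.5 * kl(p, m) + 0.5 * kl(q, m)
    return math.sqrt(max(divergence, 0.0))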
Example #2
def GetClosestSentenceToAbsOfVector(p_vector, p_pmf_objects):

    # Replace each component of the input vector with its absolute value (in place)
    for index in range(len(p_vector)):
        p_vector[index] = abs(p_vector[index])

    distances = []
    for obj in p_pmf_objects:
        current_vector = [obj.pos()["pos"][tag] for tag in spacy_pos_order]
        distances.append(jensen_shannon_distance(p_vector, current_vector))

    return p_pmf_objects[distances.index(min(distances))].sentence()
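GetClosestSentenceToAbsOfVector assumes a module-level spacy_pos_order list of part-of-speech tags and PMF-like objects that expose pos() and sentence() methods. The stand-ins below are purely hypothetical and only illustrate that expected interface (with jensen_shannon_distance defined as in the sketch under Example #1):

spacy_pos_order = ["NOUN", "VERB", "ADJ"]   # illustrative subset of spaCy POS tags

class FakePMF(object):
    # Hypothetical object exposing the two methods the function calls
    def __init__(self, sentence, pos_probs):
        self._sentence = sentence
        self._pos_probs = pos_probs          # e.g. {"NOUN": 0.5, "VERB": 0.4, "ADJ": 0.1}
    def pos(self):
        return {"pos": self._pos_probs}
    def sentence(self):
        return self._sentence

objects = [FakePMF("A dog runs.", {"NOUN": 0.5, "VERB": 0.4, "ADJ": 0.1}),
           FakePMF("Nice red car.", {"NOUN": 0.4, "VERB": 0.1, "ADJ": 0.5})]
print(GetClosestSentenceToAbsOfVector([0.45, -0.35, 0.2], objects))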
Example #3
    def Build_CorpusMapJSON(corpus_name, corpus_topics, file_topic_proportions, output_dir):

        '''
          TWiC JSON Hierarchy: Corpus -> Clusters -> Texts
          // Corpus
          {
              "name": <corpus_name>,
              "ideal_text":<file_id>, - Text with average topic distribution
              "distance2ideal":"NA",
              "topics" : {
                  <topic_id> : [<rank>, <topic_proportion>],...
              },
              // Clusters
              "children" : [
                  {
                    "name":<cluster_name>, - Topic number
                    "ideal_text":<file_id>, - Text where topic is strongest
                    "distance2ideal":<jd distance from cluster ideal text to corpus ideal text>
                    "topics" : {
                        <topic_id> : [<rank>, <topic_proportion>],...
                    },
                    // Texts
                    "children":[
                        {
                            "name":<text_name>,
                            "ideal_text":<file_id> - Self
                            "distance2ideal":<jd distance from this text to cluster ideal text>
                            "topics": {
                                <topic_id> : [<rank>, <topic_proportion>],...
                            },
                            "children":[]
                        },...
                    ]
                  },...
              ]
          }
        '''

        # 1. Define corpus level JSON
        twic_corpus_map = {
            "name" : corpus_name,
            "ideal_text" : "",
            "distance2ideal" : "",
            "topics" : {},
            "children" : []
        }

        # Build a ranked map of corpus-level topics for the JSON
        corpus_topic_pairs = [[topic, corpus_topics[topic]] for topic in corpus_topics.keys()]
        sorted_corpus_topic_pairs = sorted(corpus_topic_pairs, key=lambda x:x[1], reverse=True)
        ranked_corpus_topic_map = {}
        # for index in range(0, len(sorted_corpus_topic_pairs)):
        #     ranked_corpus_topic_map[sorted_corpus_topic_pairs[index][0]] = [index + 1, sorted_corpus_topic_pairs[index][1]]
        ranked_corpus_topic_map = { sorted_corpus_topic_pairs[index][0]:
                                    [index + 1, sorted_corpus_topic_pairs[index][1]]
                                    for index in range(len(sorted_corpus_topic_pairs))}
        twic_corpus_map["topics"] = ranked_corpus_topic_map

        '''# Determine average topic distribution for corpus
        doc_count = float(len(file_topic_proportions))
        topic_count = len(corpus_topic_proportions.keys())
        corpus_proportion_sums = {}
        avg_corpus_distribution = {}
        for doc in file_topic_proportions:
            for topic_id in topic_guide.keys():
                if topic_id not in corpus_proportion_sums.keys():
                    corpus_proportion_sums[topic_id] = 0.0
                corpus_proportion_sums[topic_id] += doc.topic_guide[topic_id]
        for topic_id in corpus_proportion_sums.keys():
            avg_corpus_distribution[topic_id] = corpus_proportion_sums[topic_id] / doc_count
        '''

        # Get corpus topic distribution sorted by topic id
        # corpus_topic_proportions = []
        # for index in range(0, len(corpus_topics.keys())):
        #    corpus_topic_proportions.append(corpus_topics[str(index)])
        corpus_topic_proportions = [corpus_topics[str(index)] for index in range(len(corpus_topics.keys()))]

        # Find the document whose topic distribution is closest to the corpus topic distribution
        distances_to_ideal = []
        for doc in file_topic_proportions:
            doc_topics = sorted(doc.sorted_topic_list, key=lambda x:x[0], reverse=False)
            doc_distribution = []
            for index in range(0, len(doc_topics)):
                doc_distribution.append(0.0)
            for index in range(0, len(doc_topics)):
                int_topic_id = int(doc_topics[index][0])
                doc_distribution[int_topic_id] = doc_topics[index][1]
            #print 'Doc distr:{0}\nCorp distr:{1}'.format(doc_distribution, corpus_topic_proportions)
            distances_to_ideal.append([doc.id, utils_jensen_shannon.jensen_shannon_distance(corpus_topic_proportions, doc_distribution)])
        distances_to_ideal = sorted(distances_to_ideal, key=lambda x:x[1], reverse=False)

        # Save the text closest to the corpus topic distribution and its distance from it
        twic_corpus_map["ideal_text"] = distances_to_ideal[0][0]
        twic_corpus_map["distance2ideal"] = distances_to_ideal[0][1]

        # 2. Now work on defining the cluster level JSON
        # cluster_distance_file = open(output_dir + cluster_distance_filename, 'r')
        # cluster_distance_data = json.load(cluster_distance_file)

        # Determine topic clusters of the corpus
        clusters_json = TWiC_MalletInterpret.DetermineCorpusClusters(file_topic_proportions, corpus_topic_proportions)
        for cluster_topic_id in clusters_json.keys():

            # Define a new cluster child of the corpus map
            current_cluster_index = len(twic_corpus_map["children"])

            # Find the ideal file in the file topic proportion collection
            file_index = -1
            for index in range(len(file_topic_proportions)):
                if file_topic_proportions[index].id == clusters_json[cluster_topic_id]["primary_doc"]:
                    file_index = index
                    break
            if file_index == -1:
                print('Could not find primary doc {0} for cluster {1} in ftp collection. Skipping cluster.'.format(
                    clusters_json[cluster_topic_id]["primary_doc"], cluster_topic_id))
                continue

            # Build a ranked list of the cluster topics for the ideal text of this cluster
            # cluster_topic_pairs = []
            # for topic in file_topic_proportions[file_index].topic_guide:
            #     cluster_topic_pairs.append([topic,file_topic_proportions[file_index].topic_guide[topic]])
            cluster_topic_pairs = [[topic, file_topic_proportions[file_index].topic_guide[topic]]
                                   for topic in file_topic_proportions[file_index].topic_guide]
            sorted_cluster_topic_pairs = sorted(cluster_topic_pairs, key=lambda x:x[1], reverse=True)
            # ranked_cluster_topic_map = {}
            # for index in range(0, len(sorted_cluster_topic_pairs)):
            #     ranked_cluster_topic_map[sorted_cluster_topic_pairs[index][0]] = [index + 1, sorted_cluster_topic_pairs[index][1]]
            ranked_cluster_topic_map = { sorted_cluster_topic_pairs[index][0]:
                                         [index + 1, sorted_cluster_topic_pairs[index][1]]
                                         for index in range(len(sorted_cluster_topic_pairs))}

            # Define each cluster level JSON entry
            twic_corpus_map["children"].append({
                "name" : cluster_topic_id,
                "ideal_text" : clusters_json[cluster_topic_id]["primary_doc"],
                "distance2ideal" : clusters_json[cluster_topic_id]["distance2cdist"],
                "topics" : ranked_cluster_topic_map,
                "children" : []
            })

            # 3. Work on the text level JSON
            for entry in clusters_json[cluster_topic_id]["linked_docs"]:

                # Reset per entry so a miss cannot reuse a previous entry's index
                my_text_index = -1
                for index in range(len(file_topic_proportions)):
                    if file_topic_proportions[index].fileid == entry[1]:
                        my_text_index = index
                        break

                twic_corpus_map["children"][current_cluster_index]["children"].append({
                        "name" : entry[1],
                        "ideal_text" : entry[0],
                        "distance2ideal" : entry[2],
                        "topics" : file_topic_proportions[my_text_index].topic_guide,
                        "children" : []
                    })

        # 4. Write out corpus map to JSON
        with open(output_dir + 'twic_corpusmap.json','w') as output_file:
            output_file.write(json.dumps(twic_corpus_map))
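Once Build_CorpusMapJSON has run, the corpus map can be reloaded and inspected. A quick, hypothetical check of the written file, assuming output_dir matches the value passed to the builder and using the keys from the docstring above:

import json

output_dir = './'   # must match the directory passed to Build_CorpusMapJSON
with open(output_dir + 'twic_corpusmap.json') as corpus_map_file:
    corpus_map = json.load(corpus_map_file)

print(corpus_map["name"], corpus_map["ideal_text"])
for cluster in corpus_map["children"]:   # one child per topic cluster
    print(cluster["name"], cluster["distance2ideal"], len(cluster["children"]))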
Example #4
    def DetermineCorpusClusters_Avg(file_topic_proportions, corpus_topic_proportions):

        clusters_json = {}
        topic_count = len(corpus_topic_proportions)
        file_count = len(file_topic_proportions)

        # print "============================"
        # print "DetermineCorpusClusters_Avg"
        # print "\nTopic Count:{0}\nFile Count: {1}".format(topic_count, file_count)

        for topic_id in range(topic_count):

            # print "\nProcessing cluster {0}".format(topic_id)

            # Clusters have name, dist2avg, topics, and text-level children
            clusters_json[topic_id] = {
                "name": topic_id,
                "children": []
            }

            # Find all texts with this topic as their top topic
            texts_with_top_topic = []
            for index in range(file_count):
                if topic_id == int(file_topic_proportions[index].sorted_topic_list[0][0]):
                    texts_with_top_topic.append(index)

            # print "Texts with top topic {0}: {1}".format(topic_id, texts_with_top_topic)

            # Get the average topic distribution for this cluster by summing each
            # topic's proportion over the member texts and dividing by the member count
            cluster_avg_topic_dist = [0.0 for index in range(topic_count)]
            for index in range(len(texts_with_top_topic)):
                for index2 in range(topic_count):
                    #print "TOPIC GUIDE:\n{0}".format(file_topic_proportions[texts_with_top_topic[index]].topic_guide)
                    cluster_avg_topic_dist[index2] += file_topic_proportions[texts_with_top_topic[index]].topic_guide[str(index2)]
            if len(texts_with_top_topic) > 0:
                for index in range(topic_count):
                    cluster_avg_topic_dist[index] /= len(texts_with_top_topic)

            # Get its distance from the corpus distribution
            clusters_json[topic_id]["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(corpus_topic_proportions, cluster_avg_topic_dist)

            # Sort and store the average topic distribution for this cluster
            clusters_json[topic_id]["topics"] = {}
            cluster_topic_list = []
            for index in range(topic_count):
                clusters_json[topic_id]["topics"][index] = [0, cluster_avg_topic_dist[index]]
                cluster_topic_list.append([index, cluster_avg_topic_dist[index]])
            cluster_topic_list = sorted(cluster_topic_list, key=lambda x:x[1], reverse=True)
            for rank in range(len(cluster_topic_list)):
                clusters_json[topic_id]["topics"][cluster_topic_list[rank][0]][0] = rank + 1

            # Now add the text-level children
            for index in range(len(texts_with_top_topic)):

                current_ftp = file_topic_proportions[texts_with_top_topic[index]]
                text_json = { "name": current_ftp.fileid }

                # Get the ranked topics/topic proportions for this text
                text_json["topics"] = {topic:[] for topic in range(topic_count)}
                for ranked_topic_pair_index in range(len(current_ftp.sorted_topic_list)):
                    text_json["topics"][int(current_ftp.sorted_topic_list[ranked_topic_pair_index][0])].append(ranked_topic_pair_index)
                    text_json["topics"][int(current_ftp.sorted_topic_list[ranked_topic_pair_index][0])].append(current_ftp.sorted_topic_list[ranked_topic_pair_index][1])

                # Calculate the distance between the cluster's average topic distribution and this text's topic distribution
                text_topic_distribution = [current_ftp.topic_guide[str(index)] for index in range(len(corpus_topic_proportions))]
                text_json["dist2avg"] = utils_jensen_shannon.jensen_shannon_distance(cluster_avg_topic_dist, text_topic_distribution)

                # Add this text to the cluster json
                clusters_json[topic_id]["children"].append(text_json)


        # print "============================"

        return clusters_json
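The rank bookkeeping above stores each topic as topic_id : [rank, proportion], with rank 1 assigned to the largest proportion. A tiny self-contained illustration of that scheme with made-up numbers:

cluster_avg_topic_dist = [0.2, 0.5, 0.3]   # hypothetical 3-topic distribution
topics = {i: [0, p] for i, p in enumerate(cluster_avg_topic_dist)}
for rank, (topic_id, _) in enumerate(
        sorted(topics.items(), key=lambda kv: kv[1][1], reverse=True), start=1):
    topics[topic_id][0] = rank
print(topics)   # {0: [3, 0.2], 1: [1, 0.5], 2: [2, 0.3]}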
Example #5
    def DetermineCorpusClusters(file_topic_proportions, corpus_topic_proportions):

        # Clustering task

        # 1. Determine which files contain the top proportion of a topic
        #    a. Those files' overall topic proportion composition become a standard around which we can
        #       base probability distribution (topic proportion composition of other texts) comparisons
        # 2. So now we have N distributions representative of N topics.
        #    a. For each file/topic proportion composition we want to compare each other file's topic proportion
        #       composition.  This renders a comparison of N files * N files, a guaranteed O(n^2) comparison -
        #       really O(n^2) - n.
        #    b. Optimization question - Can we trim further than O(n^2) - n?
        #    c. Once all comparisons are done we have an N * N array of probability distribution distances.
        # 3. Clustering parametrization becomes a question.
        #    a. How many documents should cluster toward the topic proportion composition standard?
        #       But is this really a question?  Let's look at clustering algorithms...
        #    b. Take the smallest distance for each file and assign it to a list of size N, representing the list
        #       of potential clusters (and also, it happens topics).

        # What we have - A list of file-topic proportion objects (for each file) which contain:
        #    a. MALLET-assigned ID
        #    b. File ID sans path and sans extension
        #    c. Full filepath
        #    d. Topic guide which matches topic id to proportion
        #    e. A list of (topic,topic proportion) pairs sorted by proportion in descending order

        # 1.

        # Map each topic id (int) to [doc id, proportion] for the document with the top proportion of that topic
        top_proportions = {}
        for doc in file_topic_proportions:
            for topic_id in doc.topic_guide.keys():
                int_topic_id = int(topic_id)
                topic_proportion = doc.topic_guide[topic_id]
                if int_topic_id not in top_proportions.keys():
                    top_proportions[int_topic_id] = [doc.id, topic_proportion]
                else:
                    if top_proportions[int_topic_id][1] < topic_proportion:
                        top_proportions[int_topic_id][0] = doc.id
                        top_proportions[int_topic_id][1] = topic_proportion

        # 2.

        # Build a list of lists of Jensen-Shannon distances for each ideal distribution
        jsd_buckets = { key : {} for key in top_proportions.keys() }
        # for key in top_proportions.keys():
        #    jsd_buckets[key] = {}

        # Get a list of distributions for all files (Mallet file ID mapped to the full distribution)
        prob_distributions = {}
        for doc in file_topic_proportions:
            distribution = [doc.topic_guide[str(index)] for index in range(len(top_proportions))]
            # for index in range(0, len(top_proportions)):
            #    distribution.append(doc.topic_guide[str(index)])
            prob_distributions[doc.id] = distribution

        # Build a list of JSD distances compared to that distribution for every other file
        for key in top_proportions.keys():

            top_file_probdistr = prob_distributions[top_proportions[key][0]]
            #print 'Topic {0} File ID: {1} Top Distribution: {2}'.format(key, top_proportions[key][0], prob_distributions[top_proportions[key][0]])

            for doc in file_topic_proportions:
                if doc.id == top_proportions[key][0]:
                    jsd_buckets[key][doc.id] = 0
                else:
                    jsd_buckets[key][doc.id] = utils_jensen_shannon.jensen_shannon_distance(top_file_probdistr, prob_distributions[doc.id])

        # 3.

        # MALLET file ids will be assigned to the cluster buckets, keyed by topic id

        # For each document, record the topic whose exemplar distribution it is closest to
        smallest_distances = {}
        for doc in file_topic_proportions:
            distances = []
            for topic_id in jsd_buckets.keys():
                #if ftp.id in jsd_buckets[topic_id].keys():
                distances.append([topic_id, jsd_buckets[topic_id][doc.id]])
            distances = sorted(distances, key=lambda x:x[1], reverse=False)
            #print 'Distances for {0}: {1}'.format(ftp.id, distances)
            smallest_distances[doc.id] = distances[0][0]

        topic_clusters = {}
        for topic_id in jsd_buckets.keys():
            topic_clusters[topic_id] = []
            for doc in file_topic_proportions:
                if topic_id == smallest_distances[doc.id]:
                    topic_clusters[topic_id].append(doc.id)

        file_count = 0
        for key in topic_clusters.keys():
            file_count += len(topic_clusters[key])
            #print 'Topic {0}, Corpus Proportion: {1} Length of Cluster list: {2}'.format(key, ctp_index[key], len(topic_clusters[key]))

        #print 'File count in cluster map: {0}'.format(file_count)

        distance2cdist_map = {}
        for topic_id in top_proportions.keys():
            file_id = top_proportions[int(topic_id)][0]
            doc_distribution = prob_distributions[file_id]
            #print 'Doc Dist Len: {0}\nDoc Dist: {1}'.format(len(doc_distribution), doc_distribution)
            distance = utils_jensen_shannon.jensen_shannon_distance(corpus_topic_proportions, doc_distribution)
            distance2cdist_map[file_id] = distance

        # Build a dictionary of document clusters with the following format:
        # {
        #     <cluster_topic_id> : {
        #         "primary_topic" : <topic_id>,
        #         "primary_doc" : <primary_doc_mallet_id>,
        #         "distance2cdist" : <distance to corpus topic distribution>,
        #         "linked_docs" : [
        #             [<mallet_id>, <file_name>, <js_distance>],
        #             ...
        #         ]
        #     },
        #     ...
        # }
        clusters_json = { }
        #print "Creating clusters_json\n==================="
        for topic_id in topic_clusters.keys():
            clusters_json[topic_id] = { "primary_topic" : topic_id,
                                        "primary_doc" : top_proportions[topic_id][0],
                                        "distance2cdist" : distance2cdist_map[top_proportions[topic_id][0]],
                                        "linked_docs" : [] }
            for mallet_file_id in topic_clusters[topic_id]:
                int_mfi = int(mallet_file_id)
                clusters_json[topic_id]["linked_docs"].append([int_mfi,
                                                              file_topic_proportions[int_mfi].fileid,
                                                              jsd_buckets[topic_id][mallet_file_id]])
            #print "clusters_json[{0}]:\n{1}\n===================".format(topic_id, clusters_json[topic_id])

        return clusters_json
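DetermineCorpusClusters (presumably a static method of TWiC_MalletInterpret, given how Build_CorpusMapJSON calls it) depends only on the file-topic proportion interface described in its opening comments. The hypothetical stand-in below is just enough to exercise the call; real objects come from the MALLET interpretation step, and jensen_shannon_distance is assumed as sketched under Example #1:

class FakeFTP(object):
    # Hypothetical file-topic proportion object: MALLET id, file id, topic guide,
    # and a (topic, proportion) list sorted by proportion in descending order
    def __init__(self, mallet_id, fileid, proportions):
        self.id = mallet_id
        self.fileid = fileid
        self.topic_guide = {str(t): p for t, p in enumerate(proportions)}
        self.sorted_topic_list = sorted(
            [[str(t), p] for t, p in enumerate(proportions)],
            key=lambda pair: pair[1], reverse=True)

ftps = [FakeFTP(0, "poem_a", [0.7, 0.2, 0.1]),
        FakeFTP(1, "poem_b", [0.1, 0.6, 0.3]),
        FakeFTP(2, "poem_c", [0.2, 0.3, 0.5])]
corpus_dist = [1.0 / 3, 1.0 / 3, 1.0 / 3]
clusters = TWiC_MalletInterpret.DetermineCorpusClusters(ftps, corpus_dist)
print(clusters[0]["primary_doc"], clusters[0]["linked_docs"])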