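# NOTE: this listing depends on helpers from companion modules that are not
# shown here: SuffixTree (a generalized suffix tree over token lists) and
# tokenize_and_stem (tokenizer/stemmer). The definitions below are
# hypothetical stand-ins for the remaining module-level names, sketched to
# match how the class uses them; the project's real versions may differ.
import operator

ALPHA = 0.5  # assumed overlap threshold for merging base clusters
K = 500  # cap on top-scored base clusters (per the similarity() docstring)
NUM_OF_FINAL_CLUSTERS = 10  # default number of final clusters to return


def F(phrase_length):
    """Hypothetical phrase-length weight f(|P|), after Zamir & Etzioni's
    classic STC: single-word phrases score 0, lengths 2-6 grow linearly,
    and longer phrases are capped."""
    if phrase_length < 2:
        return 0.0
    return float(min(phrase_length, 6))


class GraphNode:
    """Hypothetical stand-in: a named graph node with undirected links."""

    def __init__(self, name):
        self.name = name
        self.links = set()

    def add_link(self, other):
        self.links.add(other)
        other.links.add(self)


def connected_components(nodes):
    """Hypothetical stand-in: yield each connected component of the
    similarity graph as a set of GraphNodes (iterative traversal)."""
    seen = set()
    for start in nodes:
        if start in seen:
            continue
        component, stack = set(), [start]
        while stack:
            node = stack.pop()
            if node in component:
                continue
            component.add(node)
            stack.extend(node.links - component)
        seen |= component
        yield component
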
class SuffixTreeClustering:
    """
    Class for suffix tree clustering
    """
    def __init__(self, snippets=None):
        """
        Args:
            snippets - optional list of strings, one news snippet per element.
            You can also construct without parameters and add snippets later:
            STC = SuffixTreeClustering()
            STC.add_strings(snippets)
        """
        self.snippets = snippets if snippets is not None else []
        self.final_phrases = {}
        self.cluster_document = {}  # base cluster -> documents it covers
        self.phrases = {}  # phrases for each base cluster
        self.scores = {}  # scores for base clusters
        self.sorted_clusters = []  # base clusters sorted by score
        self.final_clusters = []  # final merged clusters
        self.top_final_clusters = []  # top n final clusters

        self.suffix_tree = SuffixTree()
        if self.snippets:
            self.add_strings(self.snippets)

    def add_strings(self, strings):
        """
        strings - strings (snippets) to add to the suffix tree
        """
        for string in strings:
            if string is not None:
                self.suffix_tree.append_string(tokenize_and_stem(string))
        self.suffix_tree.fix_input_string()

    def find_base_clusters(self, node=None):
        """
        Find base clusters, recursive
        """
        if node is None:
            node = self.suffix_tree.root

        if node.edges:
            for child in node.edges.values():
                self.find_base_clusters(child)

                # if the child is a cluster, the parent is a cluster too
                if self.cluster_document.get(child.identifier) is not None and (
                        child.parent != self.suffix_tree.root):
                    if self.phrases.get(child.parent.identifier) is None:
                        self.phrases[
                            child.parent.identifier] = child.parent.phrase
                    if self.cluster_document.get(
                            child.parent.identifier) is None:
                        self.cluster_document[
                            child.parent.identifier] = self.cluster_document[
                                child.identifier][:]
                    else:
                        self.cluster_document[
                            child.parent.identifier] += self.cluster_document[
                                child.identifier]

        else:
            # leaf: record the document's bit vector under the parent node
            if node.parent != self.suffix_tree.root:
                if self.phrases.get(node.parent.identifier) is None:
                    self.phrases[node.parent.identifier] = node.parent.phrase
                if self.cluster_document.get(node.parent.identifier) is None:
                    self.cluster_document[node.parent.identifier] = [
                        node.bit_vector
                    ]
                else:
                    self.cluster_document[node.parent.identifier].append(
                        node.bit_vector)
        return
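
    # Example (following the classic STC illustration): for the snippets
    # "cat ate cheese" and "mouse ate cheese too", the internal node whose
    # phrase is "ate cheese" covers both documents and becomes a base
    # cluster; the leaves below it contribute their document bit vectors.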

    def count_scores(self):
        """
        Count scores for base clusters
        Formula: Score(S) = |B| * F(|P|),
        where |B| is the size of the cluster (number of covered documents)
        and |P| is the number of words in the phrase
        """
        for cluster in self.phrases.keys():
            self.scores[cluster] = len(self.cluster_document[cluster]) * F(
                len(self.phrases[cluster].split(' ')))
        return
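
    # Worked example under the hypothetical F sketch above: a base cluster
    # covering 4 documents with a 3-word phrase scores 4 * F(3) = 12, while
    # a single-word phrase scores 4 * F(1) = 0 under the same sketch.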

    def similarity(self, base_clusters):
        """
        Compute the similarity matrix
        Args:
            base_clusters - the top base clusters (at most K = 500),
            sorted by score
        Return:
            similarity matrix of the clusters
        """
        Sim = [[0] * len(base_clusters) for _ in range(len(base_clusters))]

        for i in range(len(base_clusters)):
            Sim[i][i] = 1
            for j in range(i + 1, len(base_clusters)):
                B1 = self.cluster_document[base_clusters[i]]
                B2 = self.cluster_document[base_clusters[j]]
                # intersection of the two clusters (common covered documents)
                intersec = set(B1).intersection(B2)
                if len(intersec) / len(B1) > ALPHA and len(intersec) / len(
                        B2) > ALPHA:
                    Sim[i][j] = 1
                    Sim[j][i] = 1  # symmetric; merge_clusters only reads j > i
        return Sim
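
    # Worked example with the assumed ALPHA = 0.5: if B1 covers {d1, d2, d3}
    # and B2 covers {d2, d3}, the intersection has size 2; since 2/3 > 0.5
    # and 2/2 > 0.5, the two base clusters are marked similar (Sim = 1).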

    def merge_clusters(self, Sim):
        """
        Merge base clusters
        Args:
            Sim - matrix of similarity between base clusters
        """
        node_names = {}  # dictionary ["name of base cluster"] = GraphNode
        for i in range(len(Sim)):
            if self.sorted_clusters[i] not in node_names:
                node_names[self.sorted_clusters[i]] = GraphNode(
                    self.sorted_clusters[i])
            # efficiency: check only later clusters; earlier pairs were
            # already handled
            for j in range(i + 1, len(Sim)):
                if Sim[i][j] == 1:
                    if self.sorted_clusters[j] not in node_names:
                        node_names[self.sorted_clusters[j]] = GraphNode(
                            self.sorted_clusters[j])
                    node_names[self.sorted_clusters[i]].add_link(
                        node_names[self.sorted_clusters[j]])
        # every connected component of the similarity graph becomes one
        # final cluster
        for component in connected_components(node_names.values()):
            names = sorted(node.name for node in component)
            self.final_clusters.append(names)
    def find_clusters(self, number_of_clusters=NUM_OF_FINAL_CLUSTERS):
        """
        Find final clusters
        Args:
            number_of_clusters - max number of final clusters,
            default = NUM_OF_FINAL_CLUSTERS = 10
        """
        if len(self.snippets) < number_of_clusters:
            print(
                "Sorry, but the number of snippets should be >= the number of clusters"
            )
            return {}

        self.sorted_clusters = []
        self.cluster_document = {}
        self.find_base_clusters()
        self.count_scores()  # compute the score of each base cluster
        self.final_clusters = []
        self.final_phrases = {}
        # sort base clusters by score
        sorted_scores = sorted(self.scores.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        n = min(
            K,
            len(sorted_scores))  # number of selected top-scored base clusters

        # select the top n base clusters
        for i in range(n):
            self.sorted_clusters.append(sorted_scores[i][0])
        # compute the similarity matrix for the selected clusters
        Sim = self.similarity(self.sorted_clusters)

        self.merge_clusters(Sim)
        # final clusters - result of merging

        # compute final scores for the final clusters
        final_scores = {}

        for final_cluster_index in range(len(self.final_clusters)):
            total = 0
            for base_cluster in self.final_clusters[final_cluster_index]:
                if final_cluster_index not in self.final_phrases:
                    self.final_phrases[final_cluster_index] = []
                if isinstance(base_cluster, list):
                    for cluster in base_cluster:
                        total += self.scores[cluster]
                        self.final_phrases[final_cluster_index].append(
                            self.phrases[cluster])
                else:
                    total += self.scores[base_cluster]
                    self.final_phrases[final_cluster_index].append(
                        self.phrases[base_cluster])

            final_scores[final_cluster_index] = total

        sorted_final_scores = sorted(final_scores.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)

        # select the top final clusters (at most number_of_clusters, default 10)

        self.top_final_clusters = []
        self.top_final_phrases = {}
        n = min(number_of_clusters, len(self.final_clusters))
        self.n_goodclusters = 0
        for rank in range(n):
            self.top_final_clusters.append(
                self.final_clusters[sorted_final_scores[rank][0]])
            if sorted_final_scores[rank][1] > 0:
                self.n_goodclusters += 1
            self.top_final_phrases[rank + 1] = self.final_phrases[
                sorted_final_scores[rank][0]]

        return self.get_clusters()

    def get_common_phrases(self, num=2):
        def restemming(word, num_snippets):
            # map a stemmed word back to an original token taken from one of
            # the covered snippets
            for num_snippet in num_snippets:
                tokenized_snippet = tokenize_and_stem(
                    self.snippets[num_snippet], stem=0)
                for sn in tokenized_snippet:
                    if word in sn:
                        return sn
            return ''

        phrases = {}
        clusters = self.get_clusters()
        for i in range(len(clusters)):
            for phrase in self.top_final_phrases[i + 1]:
                if i + 1 not in phrases:
                    phrases[i + 1] = []
                words = phrase.split(' ')
                for word in words:
                    restem = restemming(word, clusters[i + 1])
                    if restem != '' and len(phrases[i + 1]) < num:
                        phrases[i + 1].append(restem)
        return phrases

    def print_common_phrases(self, num=2):
        result = self.get_common_phrases(num=num)
        for cluster, phrases in result.items():
            print("cluster #%i tags: " % cluster, end=' ')
            print(phrases)

    def get_number_of_good_clusters(self):
        return self.n_goodclusters

    def print_clusters(self):
        result = self.get_clusters()
        for cluster, snippets in result.items():
            print("cluster #%i contains documents: " % cluster, end=' ')
            print(snippets)

    def get_clusters(self):
        result = {}
        for count, cluster in enumerate(self.top_final_clusters, start=1):
            documents = []
            for base_cluster in cluster:
                documents.append(set(self.cluster_document[base_cluster]))
            result[count] = list(frozenset().union(*documents))

        return result
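

# Minimal usage sketch (hypothetical snippets; assumes SuffixTree and
# tokenize_and_stem are importable from the companion modules):
if __name__ == "__main__":
    docs = [
        "cat ate cheese",
        "mouse ate cheese too",
        "cat ate mouse too",
    ]
    stc = SuffixTreeClustering(docs)
    stc.find_clusters(number_of_clusters=2)
    stc.print_clusters()        # e.g. cluster #1 contains documents: [...]
    stc.print_common_phrases()  # e.g. cluster #1 tags: [...]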