class TextRankRanker(RankerC):
    """
    Component performing candidate terms ranking based on the TextRank score of
    their words.
    """

    def __init__(self,
                 name,
                 is_lazy,
                 lazy_directory,
                 debug,
                 strategy,
                 scoring_function,
                 ordering_criteria=ORDERING_CRITERIA.POSITION):
        """
        Constructor of the component.

        @param  name:               The name of the component.
        @type   name:               C{string}
        @param  is_lazy:            True if the component must load previous
                                    data, False if data must be computed even
                                    though they have already been computed.
        @type   is_lazy:            C{bool}
        @param  lazy_directory:     The directory used to store previously
                                    computed data.
        @type   lazy_directory:     C{string}
        @param  debug:              True if the component is in debug mode,
                                    else False. When the component is in debug
                                    mode, it will output each step of its
                                    processing.
        @type   debug:              C{bool}
        @param  strategy:           The strategy used to specialize the graph
                                    construction and usage.
        @type   strategy:           C{TextRankStrategy}
        @param  scoring_function:   Function used to compute the scores of the
                                    textual units, when the given candidates to
                                    weight are not single words.
        @type   scoring_function:   C{function(expression, word_weights):
                                    float}
        @param  ordering_criteria:  The criteria used to order a cluster.
                                    - Position: the first appearing candidate,
                                      in the document, is ranked first.
                                    - Frequency: the most frequent candidate,
                                      in the document, is ranked first.
                                    - Centroid: the centroid of the cluster is
                                      ranked first.
        @type   ordering_criteria:  C{ORDERING_CRITERIA}
        """

        super(TextRankRanker, self).__init__(name,
                                             is_lazy,
                                             lazy_directory,
                                             debug)

        self._strategy = strategy
        # NOTE(review): the three numeric arguments are presumably the
        # convergence threshold, the damping factor and the iteration cap --
        # confirm against the TextRank constructor.
        self._textrank = TextRank(strategy,
                                  scoring_function,
                                  0.0001,
                                  0.85,
                                  1000000)
        self._ordering_criteria = ordering_criteria

    def weighting(self, pre_processed_file, candidates, clusters):
        """
        Takes a pre-processed text (list of POS-tagged sentences) and gives a
        weight to its candidates keyphrases.

        @param    pre_processed_file: The pre-processed file.
        @type     pre_processed_file: C{PreProcessedFile}
        @param    candidates:         The keyphrase candidates.
        @type     candidates:         C{list(string)}
        @param    clusters:           The clustered candidates.
        @type     clusters:           C{list(list(string))}

        @return:  A dictionary of terms as key and weight as value.
        @rtype:   C{dict(string, float)}
        """

        # Cheat: the TopicRank strategy needs the clusters of the current
        # document, so they are reset on the strategy before ranking.
        if isinstance(self._textrank.strategy(), TopicRankStrategy):
            self._strategy.set_clusters(clusters)

        ranking = self._textrank.rank(candidates,
                                      pre_processed_file.full_text())

        return {candidate: score for candidate, score in ranking}

    def ordering(self, weights, clusters):
        """
        Takes the weighted terms of the analysed text and orders them by
        decreasing weight.

        With a TopicRank strategy, only one term per cluster is kept: the one
        that C{cluster_ordering} ranks first. The clusters are read from the
        strategy (C{token_ids()}), not from the C{clusters} argument.

        @param    weights:  A dictionary of weighted candidates.
        @type     weights:  C{dict(string, float)}
        @param    clusters: The clustered candidates (unused, see above).
        @type     clusters: C{list(list(string))}

        @return:  An ordered list of weighted terms.
        @rtype:   C{list(tuple(string, float))}
        """

        strategy = self._textrank.strategy()

        if not isinstance(strategy, TopicRankStrategy):
            return sorted(weights.items(),
                          key=lambda row: row[1],
                          reverse=True)

        tag_separator = strategy.tag_separator()
        ordered_terms = []

        # extraction of the best candidate term per cluster
        for cluster in strategy.token_ids().values():
            # an empty cluster has no representative to contribute
            if not cluster:
                continue

            untagged_cluster = [self._untag(term, tag_separator)
                                for term in cluster]
            cluster_keyphrase = self.cluster_ordering(untagged_cluster)[0]

            ordered_terms.append((cluster_keyphrase,
                                  weights[cluster_keyphrase]))

        return sorted(ordered_terms, key=lambda row: row[1], reverse=True)

    def cluster_ordering(self, cluster):
        """
        Orders the elements of a cluster, based on the ordering criteria given
        at construction. Ties are broken in favour of the terms with the most
        words.

        Terms that are never found in the context are ranked last for the
        position and frequency criteria.

        @param    cluster: The cluster to re-order (untagged terms).
        @type     cluster: C{list(string)}

        @return:  The re-ordered cluster.
        @rtype:   C{list(string)}
        """

        criteria = self._ordering_criteria

        if criteria == ORDERING_CRITERIA.POSITION:
            first_positions, _ = self._occurrence_stats(cluster)
            never_seen = float("inf")

            return sorted(cluster,
                          key=lambda term: (first_positions.get(term,
                                                                never_seen),
                                            -len(term.split())))

        if criteria == ORDERING_CRITERIA.FREQUENCY:
            _, frequency = self._occurrence_stats(cluster)

            # negated so that the most frequent term comes first
            return sorted(cluster,
                          key=lambda term: (-frequency.get(term, 0.0),
                                            -len(term.split())))

        centroid = self._centroid(cluster)

        # False sorts before True, hence the centroid comes first
        return sorted(cluster,
                      key=lambda term: (term != centroid,
                                        -len(term.split())))

    @staticmethod
    def _untag(tagged_text, tag_separator):
        """
        Removes the tag of every whitespace-separated token of a text.

        @param    tagged_text:    The text whose tokens are C{word<sep>tag}.
        @type     tagged_text:    C{string}
        @param    tag_separator:  The separator between a word and its tag.
        @type     tag_separator:  C{string}

        @return:  The words of the text, separated by single spaces.
        @rtype:   C{string}
        """

        words = (token.rsplit(tag_separator, 1)[0]
                 for token in tagged_text.split())

        # a token whose word part is empty must not leave a leading space
        return " ".join(words).lstrip(" ")

    def _centroid(self, cluster):
        """
        Finds the centroid of a cluster of untagged terms.

        C{cluster_centroid} is called with tagged terms, so every word is given
        a fake "fk" tag first.

        @param    cluster: The cluster of untagged terms.
        @type     cluster: C{list(string)}

        @return:  The term of the cluster matching the computed centroid (the
                  last one when several match), or the empty string when none
                  matches.
        @rtype:   C{string}
        """

        strategy = self._textrank.strategy()
        tag_separator = strategy.tag_separator()
        fake_tag = tag_separator + "fk"

        fake_pos_tagged_cluster = [
            " ".join(word + fake_tag for word in term.split())
            for term in cluster
        ]
        tagged_centroid = cluster_centroid(fake_pos_tagged_cluster,
                                           tag_separator,
                                           strategy.stemmer())

        centroid = ""
        for term, tagged_term in zip(cluster, fake_pos_tagged_cluster):
            if tagged_term == tagged_centroid:
                centroid = term

        return centroid

    def _occurrence_stats(self, cluster):
        """
        Computes, for the terms of a cluster, their first position in the
        context of the strategy and the number of sentences containing them.

        Terms are searched as plain substrings of the untagged sentences. The
        position is a character offset over the concatenated untagged
        sentences.

        @param    cluster: The cluster of untagged terms.
        @type     cluster: C{list(string)}

        @return:  The first positions and the frequencies of the terms. Terms
                  that are never found are absent from both dictionaries.
        @rtype:   C{tuple(dict(string, int), dict(string, float))}
        """

        strategy = self._textrank.strategy()
        tag_separator = strategy.tag_separator()

        sentence_length_accumulator = 0
        first_positions = {}
        frequency = {}

        for sentence in strategy.context():
            untagged_sentence = self._untag(sentence, tag_separator)

            for term in cluster:
                pos = untagged_sentence.find(term)

                if pos >= 0:
                    # only the very first occurrence is recorded
                    first_positions.setdefault(
                        term,
                        sentence_length_accumulator + pos + 1
                    )
                    frequency[term] = frequency.get(term, 0.0) + 1.0

            sentence_length_accumulator += len(untagged_sentence)

        return first_positions, frequency
class TextRankRanker(RankerC):
    """
    Component performing candidate terms ranking based on the TextRank score of
    their words.
    """

    def __init__(self,
                 name,
                 is_lazy,
                 lazy_directory,
                 debug,
                 strategy,
                 scoring_function,
                 ordering_criteria=ORDERING_CRITERIA.POSITION):
        """
        Constructor of the component.

        @param  name:               The name of the component.
        @type   name:               C{string}
        @param  is_lazy:            True if the component must load previous
                                    data, False if data must be computed even
                                    though they have already been computed.
        @type   is_lazy:            C{bool}
        @param  lazy_directory:     The directory used to store previously
                                    computed data.
        @type   lazy_directory:     C{string}
        @param  debug:              True if the component is in debug mode,
                                    else False. When the component is in debug
                                    mode, it will output each step of its
                                    processing.
        @type   debug:              C{bool}
        @param  strategy:           The strategy used to specialize the graph
                                    construction and usage.
        @type   strategy:           C{TextRankStrategy}
        @param  scoring_function:   Function used to compute the scores of the
                                    textual units, when the given candidates to
                                    weight are not single words.
        @type   scoring_function:   C{function(expression, word_weights):
                                    float}
        @param  ordering_criteria:  The criteria used to order a cluster.
                                    - Position: the first appearing candidate,
                                      in the document, is ranked first.
                                    - Frequency: the most frequent candidate,
                                      in the document, is ranked first.
                                    - Centroid: the centroid of the cluster is
                                      ranked first.
        @type   ordering_criteria:  C{ORDERING_CRITERIA}
        """

        super(TextRankRanker, self).__init__(name,
                                             is_lazy,
                                             lazy_directory,
                                             debug)

        self._strategy = strategy
        # NOTE(review): the three numeric arguments are presumably the
        # convergence threshold, the damping factor and the iteration cap --
        # confirm against the TextRank constructor.
        self._textrank = TextRank(strategy,
                                  scoring_function,
                                  0.0001,
                                  0.85,
                                  1000000)
        self._ordering_criteria = ordering_criteria

    def weighting(self, pre_processed_file, candidates, clusters):
        """
        Takes a pre-processed text (list of POS-tagged sentences) and gives a
        weight to its candidates keyphrases.

        @param    pre_processed_file: The pre-processed file.
        @type     pre_processed_file: C{PreProcessedFile}
        @param    candidates:         The keyphrase candidates.
        @type     candidates:         C{list(string)}
        @param    clusters:           The clustered candidates.
        @type     clusters:           C{list(list(string))}

        @return:  A dictionary of terms as key and weight as value.
        @rtype:   C{dict(string, float)}
        """

        # Cheat: the TopicRank strategy needs the clusters of the current
        # document, so they are reset on the strategy before ranking.
        if isinstance(self._textrank.strategy(), TopicRankStrategy):
            self._strategy.set_clusters(clusters)

        ranking = self._textrank.rank(candidates,
                                      pre_processed_file.full_text())

        return {candidate: score for candidate, score in ranking}

    def ordering(self, weights, clusters):
        """
        Takes the weighted terms of the analysed text and orders them by
        decreasing weight.

        With a TopicRank strategy, only one term per cluster is kept: the one
        that C{cluster_ordering} ranks first. The clusters are read from the
        strategy (C{token_ids()}), not from the C{clusters} argument.

        @param    weights:  A dictionary of weighted candidates.
        @type     weights:  C{dict(string, float)}
        @param    clusters: The clustered candidates (unused, see above).
        @type     clusters: C{list(list(string))}

        @return:  An ordered list of weighted terms.
        @rtype:   C{list(tuple(string, float))}
        """

        strategy = self._textrank.strategy()

        if not isinstance(strategy, TopicRankStrategy):
            return sorted(weights.items(),
                          key=lambda row: row[1],
                          reverse=True)

        tag_separator = strategy.tag_separator()
        ordered_terms = []

        # extraction of the best candidate term per cluster
        for cluster in strategy.token_ids().values():
            # an empty cluster has no representative to contribute
            if not cluster:
                continue

            untagged_cluster = [self._untag(term, tag_separator)
                                for term in cluster]
            cluster_keyphrase = self.cluster_ordering(untagged_cluster)[0]

            ordered_terms.append((cluster_keyphrase,
                                  weights[cluster_keyphrase]))

        return sorted(ordered_terms, key=lambda row: row[1], reverse=True)

    def cluster_ordering(self, cluster):
        """
        Orders the elements of a cluster, based on the ordering criteria given
        at construction. Ties are broken in favour of the terms with the most
        words.

        Terms that are never found in the context are ranked last for the
        position and frequency criteria.

        @param    cluster: The cluster to re-order (untagged terms).
        @type     cluster: C{list(string)}

        @return:  The re-ordered cluster.
        @rtype:   C{list(string)}
        """

        criteria = self._ordering_criteria

        if criteria == ORDERING_CRITERIA.POSITION:
            first_positions, _ = self._occurrence_stats(cluster)
            never_seen = float("inf")

            return sorted(cluster,
                          key=lambda term: (first_positions.get(term,
                                                                never_seen),
                                            -len(term.split())))

        if criteria == ORDERING_CRITERIA.FREQUENCY:
            _, frequency = self._occurrence_stats(cluster)

            # negated so that the most frequent term comes first
            return sorted(cluster,
                          key=lambda term: (-frequency.get(term, 0.0),
                                            -len(term.split())))

        centroid = self._centroid(cluster)

        # False sorts before True, hence the centroid comes first
        return sorted(cluster,
                      key=lambda term: (term != centroid,
                                        -len(term.split())))

    @staticmethod
    def _untag(tagged_text, tag_separator):
        """
        Removes the tag of every whitespace-separated token of a text.

        @param    tagged_text:    The text whose tokens are C{word<sep>tag}.
        @type     tagged_text:    C{string}
        @param    tag_separator:  The separator between a word and its tag.
        @type     tag_separator:  C{string}

        @return:  The words of the text, separated by single spaces.
        @rtype:   C{string}
        """

        words = (token.rsplit(tag_separator, 1)[0]
                 for token in tagged_text.split())

        # a token whose word part is empty must not leave a leading space
        return " ".join(words).lstrip(" ")

    def _centroid(self, cluster):
        """
        Finds the centroid of a cluster of untagged terms.

        C{cluster_centroid} is called with tagged terms, so every word is given
        a fake "fk" tag first.

        @param    cluster: The cluster of untagged terms.
        @type     cluster: C{list(string)}

        @return:  The term of the cluster matching the computed centroid (the
                  last one when several match), or the empty string when none
                  matches.
        @rtype:   C{string}
        """

        strategy = self._textrank.strategy()
        tag_separator = strategy.tag_separator()
        fake_tag = tag_separator + "fk"

        fake_pos_tagged_cluster = [
            " ".join(word + fake_tag for word in term.split())
            for term in cluster
        ]
        tagged_centroid = cluster_centroid(fake_pos_tagged_cluster,
                                           tag_separator,
                                           strategy.stemmer())

        centroid = ""
        for term, tagged_term in zip(cluster, fake_pos_tagged_cluster):
            if tagged_term == tagged_centroid:
                centroid = term

        return centroid

    def _occurrence_stats(self, cluster):
        """
        Computes, for the terms of a cluster, their first position in the
        context of the strategy and the number of sentences containing them.

        Terms are searched as plain substrings of the untagged sentences. The
        position is a character offset over the concatenated untagged
        sentences.

        @param    cluster: The cluster of untagged terms.
        @type     cluster: C{list(string)}

        @return:  The first positions and the frequencies of the terms. Terms
                  that are never found are absent from both dictionaries.
        @rtype:   C{tuple(dict(string, int), dict(string, float))}
        """

        strategy = self._textrank.strategy()
        tag_separator = strategy.tag_separator()

        sentence_length_accumulator = 0
        first_positions = {}
        frequency = {}

        for sentence in strategy.context():
            untagged_sentence = self._untag(sentence, tag_separator)

            for term in cluster:
                pos = untagged_sentence.find(term)

                if pos >= 0:
                    # only the very first occurrence is recorded
                    first_positions.setdefault(
                        term,
                        sentence_length_accumulator + pos + 1
                    )
                    frequency[term] = frequency.get(term, 0.0) + 1.0

            sentence_length_accumulator += len(untagged_sentence)

        return first_positions, frequency