class LexRank:
    """
    Constructs a summary of the input document by extracting the most
    informative sentences.

    Arguments:
    directory - A corpus of text files to be summarized.
    """

    def __init__(self, directory):
        # Sentence-graph model built from the corpus.
        self.graph = TextGraph(directory)

    def lexR(self, graph):
        """
        Compute the LexRank of the sentences.

        LexRank of a sentence in the sentence graph is the PageRank of the
        node representing the sentence.  It is a measure of the importance
        and influence of the sentence in the corpus.

        Arguments:
        graph - A networkx graph or digraph.

        Returns:
        A dictionary of all the nodes with their PageRank scores.
        """
        # nx.pagerank_numpy was deprecated and removed in networkx 3.0;
        # nx.pagerank computes the same scores (to numerical tolerance)
        # and exists in both old and new networkx releases.
        pr = nx.pagerank(graph, alpha=0.85)
        return pr

    def summary(self, compression=0.25):
        """
        Builds the summary based on the LexRank scores of the sentences.

        Arguments:
        compression : A number in [0, 1] equal to the fraction of the
            total sentences to be included in the summary.
            Default value is 0.25.

        Returns:
        Summary of the input document: the top-ranked sentences,
        concatenated in rank order.
        """
        g = self.graph.sentenceGraph()
        total_sentences = len(g.nodes())
        n_sentences = int(total_sentences * compression)
        rankings = self.lexR(g)
        # dict.items() works on both Python 2 and 3; the original
        # .iteritems() is Python-2-only and raises AttributeError on 3.
        ranked_sentences = sorted(rankings.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        # join() avoids the quadratic cost of repeated string +=.
        return "".join(sentence for sentence, _score
                       in ranked_sentences[:n_sentences])
class LexRank:
    """
    Extractive summarizer: selects the most informative sentences
    from a corpus of documents.

    Arguments:
    directory - A corpus of text files to be summarized.
    """

    def __init__(self, directory):
        # Text-graph model built from the corpus.
        self.graph = TextGraph(directory)

    def lexR(self, graph):
        """
        Score every sentence node with its LexRank.

        The LexRank of a sentence is the PageRank of its node in the
        sentence graph — a measure of the sentence's importance and
        influence within the corpus.

        Arguments:
        graph - A networkx graph or digraph.

        Returns:
        A dictionary mapping each node to its PageRank score.
        """
        return nx.pagerank_numpy(graph, alpha=0.85)

    def summary(self, compression=0.25):
        """
        Assemble a summary from the top-ranked sentences.

        Arguments:
        compression : Fraction in [0, 1] of the total sentences to keep
            in the summary.  Default value is 0.25.

        Returns:
        The selected sentences concatenated in rank order.
        """
        sentence_graph = self.graph.sentenceGraph()
        keep = int(len(sentence_graph.nodes()) * compression)
        scores = self.lexR(sentence_graph)
        by_score = sorted(scores.iteritems(),
                          key=lambda pair: pair[1],
                          reverse=True)
        text = ""
        for sentence, _score in by_score[:keep]:
            text += sentence
        return text
def __init__(self, directory, cooccuranceThrehsold=1):
    """Build the keyword graph and document-frequency index for *directory*."""
    # Remember where the corpus lives and wrap it in the text model.
    self.corpus = directory
    self.model = TextGraph(directory)
    # Derived structures: per-keyword document frequencies and the
    # co-occurrence keyword graph (independent of each other).
    self.docFreqDict = self.model.docFreq()
    self.graph = self.model.keywordGraph(cooccuranceThrehsold)
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.

    KeyGraph is a slight modification of the keywordGraph as defined in the
    paper ...  KeyGraph is used in extracting topics for the corpus of the
    documents.

    Arguments:
    directory : a corpus of text files.
    cooccuranceThrehsold : If the cooccurances of two keywords is above
        cooccuranceThrehsold, there is an edge between the nodes
        represented by the keywords.  Default value is 1.
    """

    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        # Maps each keyword to the documents containing it.
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the conditional probability that a document contains
        keyword2, given that it contains keyword1.

        Arguments:
        keyword1 : first (conditioning) keyword.
        keyword2 : second keyword.

        Returns:
        Numerical value of the probability, in [0, 1].
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])
        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() prevents Python 2 integer (floor) division, which would
        # collapse every probability to 0 or 1 and make thresholds like
        # 0.01 in network() meaningless.
        prob = float(commonDocs) / len(docsContainingKeyword1)
        return prob

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning rare keywords and weak edges.

        Arguments:
        docFreqThreshold : If the document frequency of a keyword is below
            docFreqThreshold, the node corresponding to the keyword is
            removed from the graph.  Default value is 2.
        probThreshold : Edges whose co-occurrence probability (in either
            direction) is below probThreshold are removed.
            Default value is 0.01.

        Returns:
        The pruned keyword graph.
        """
        g = self.graph
        # Snapshot nodes/edges before mutating: networkx >= 2 returns live
        # views that must not be modified while being iterated.
        allNodes = list(g.nodes())
        allEdges = list(g.edges())
        documentFrequency = self.docFreqDict
        for x in allNodes:
            if len(documentFrequency[x]) < docFreqThreshold:
                g.remove_node(x)
        for node1, node2 in allEdges:
            # remove_node() above also deleted its incident edges; skip
            # those or remove_edge() would raise on a missing edge.
            if not g.has_edge(node1, node2):
                continue
            if (self.probCoocurance(node1, node2) < probThreshold) or (
                    self.probCoocurance(node2, node1) < probThreshold):
                g.remove_edge(node1, node2)
        return g
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank

# Corpus of text files to analyse.
directory = 'Data'

# Build the sentence graph for the corpus.
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps this working on networkx >= 2, where nodes() returns a
# non-indexable NodeView.
node = list(senGraph.nodes())[0]

# Plot the sentence graph.
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Connected-component and centrality analysis.
out1 = Analysis(senGraph)
l = out1.nodeInConnectedComponent(node)
# print(...) is valid on both Python 2 and 3; the bare print statement
# is a syntax error on Python 3.
print(l.nodes())
cS = out1.centralNodes(4)
print(cS)

# Summarize the corpus.
lR = LexRank(directory)
print(lR.summary())
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.

    KeyGraph is a slight modification of the keywordGraph as defined in the
    paper ...  KeyGraph is used in extracting topics for the corpus of the
    documents.

    Arguments:
    directory : a corpus of text files.
    cooccuranceThrehsold : If the cooccurances of two keywords is above
        cooccuranceThrehsold, there is an edge between the nodes
        represented by the keywords.  Default value is 1.
    """

    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        # Maps each keyword to the documents containing it.
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the conditional probability that a document contains
        keyword2, given that it contains keyword1.

        Arguments:
        keyword1 : first (conditioning) keyword.
        keyword2 : second keyword.

        Returns:
        Numerical value of the probability, in [0, 1].
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])
        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() prevents Python 2 integer (floor) division, which would
        # round every non-certain probability down to 0 and defeat the
        # 0.01 threshold used in network().
        prob = float(commonDocs) / len(docsContainingKeyword1)
        return prob

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning rare keywords and weak edges.

        Arguments:
        docFreqThreshold : If the document frequency of a keyword is below
            docFreqThreshold, the node corresponding to the keyword is
            removed from the graph.  Default value is 2.
        probThreshold : Edges whose co-occurrence probability (in either
            direction) is below probThreshold are removed.
            Default value is 0.01.

        Returns:
        The pruned keyword graph.
        """
        g = self.graph
        # Copy nodes/edges up front: networkx >= 2 hands back live views
        # that cannot be mutated during iteration.
        allNodes = list(g.nodes())
        allEdges = list(g.edges())
        documentFrequency = self.docFreqDict
        for x in allNodes:
            if len(documentFrequency[x]) < docFreqThreshold:
                g.remove_node(x)
        for node1, node2 in allEdges:
            # Edges incident to nodes removed above are already gone;
            # calling remove_edge on them would raise.
            if not g.has_edge(node1, node2):
                continue
            if (self.probCoocurance(node1, node2) < probThreshold) or (
                    self.probCoocurance(node2, node1) < probThreshold):
                g.remove_edge(node1, node2)
        return g
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank

## Import the corpus of text files.
directory = 'Data'

## Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps this working on networkx >= 2, where nodes() returns a
# non-indexable NodeView.
node = list(senGraph.nodes())[0]

## Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph
kwgraph = g.keywordGraph()
# print(...) is valid on both Python 2 and 3; the bare print statement
# is a syntax error on Python 3.
print(len(kwgraph.nodes()))

## Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)

#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node.
l = out1.nodeInConnectedComponent(node)
print(l.nodes())
def __init__(self, directory):
    """
    Initialize the model from a corpus of text files.

    Arguments:
    directory - A corpus of text files (passed through to TextGraph).
    """
    # Text-graph model built from the corpus; stored for later use by the
    # enclosing class's other methods (not visible in this fragment).
    self.graph = TextGraph(directory)
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank

## Import the corpus of text files.
directory = "Data"

## Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps this working on networkx >= 2, where nodes() returns a
# non-indexable NodeView.
node = list(senGraph.nodes())[0]

## Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph
kwgraph = g.keywordGraph()
# print(...) is valid on both Python 2 and 3; the bare print statement
# is a syntax error on Python 3.
print(len(kwgraph.nodes()))

## Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)

#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node.