Example #1
0
class LexRank:
	"""
	Constructs a summary of the input document by extracting the most
	informative sentences.

	Arguments:
		directory - A corpus of text files to be summarized.
	"""
	def __init__(self, directory):
		self.graph = TextGraph(directory)

	def lexR(self, graph):
		"""
		Compute the LexRank of the sentences.
		LexRank of a sentence in the sentence graph is the PageRank of the node
		representing the sentence. It is a measure of the importance and influence
		of the sentence in the corpus.

		Arguments:
			graph -  A networkx graph or digraph.

		Returns:
			A dictionary of all the nodes with their PageRank scores.
		"""
		# pagerank_numpy is not available in every networkx release; use it
		# when present (unchanged behaviour) and fall back to nx.pagerank.
		pagerank = getattr(nx, "pagerank_numpy", None) or nx.pagerank
		return pagerank(graph, alpha=0.85)

	def summary(self, compression = 0.25):
		"""
		Builds the summary based on the LexRank scores of the sentences.

		Arguments:
			compression : A number in [0,1] which is equal to the fraction of total
			sentences to be included in the summary. Values that ask for more
			sentences than exist are capped at the whole document; zero or
			negative values give an empty summary.
			Default value is 0.25

		Returns:
			The selected sentences concatenated in decreasing order of score.
		"""
		g = self.graph.sentenceGraph()
		total_sentences = len(g.nodes())
		n_sentences = int(total_sentences * compression)

		rankings = self.lexR(g)
		# .items() exists on both Python 2 and 3 dicts (iteritems is 2-only).
		ranked_sentences = sorted(
			rankings.items(), key=operator.itemgetter(1), reverse=True
		)

		# Slicing cannot run past the end of the list; max() keeps a negative
		# count from being read as "all but the last k".
		selected = ranked_sentences[:max(n_sentences, 0)]
		return "".join(sentence for sentence, _score in selected)
Example #2
0
class LexRank:
    """
    Constructs a summary of the input document by extracting the most
    informative sentences.

    Arguments:
        directory - A corpus of text files to be summarized.
    """
    def __init__(self, directory):
        self.graph = TextGraph(directory)

    def lexR(self, graph):
        """
        Compute the LexRank of the sentences.
        LexRank of a sentence in the sentence graph is the PageRank of the node
        representing the sentence. It is a measure of the importance and influence
        of the sentence in the corpus.

        Arguments:
            graph -  A networkx graph or digraph.

        Returns:
            A dictionary of all the nodes with their PageRank scores.
        """
        # pagerank_numpy is not available in every networkx release; use it
        # when present (unchanged behaviour) and fall back to nx.pagerank.
        pagerank = getattr(nx, "pagerank_numpy", None) or nx.pagerank
        return pagerank(graph, alpha=0.85)

    def summary(self, compression=0.25):
        """
        Builds the summary based on the LexRank scores of the sentences.

        Arguments:
            compression : A number in [0,1] which is equal to the fraction of total
            sentences to be included in the summary. Values that ask for more
            sentences than exist are capped at the whole document; zero or
            negative values give an empty summary.
            Default value is 0.25

        Returns:
            The selected sentences concatenated in decreasing order of score.
        """
        g = self.graph.sentenceGraph()
        total_sentences = len(g.nodes())
        n_sentences = int(total_sentences * compression)

        rankings = self.lexR(g)
        # .items() exists on both Python 2 and 3 dicts (iteritems is 2-only).
        ranked_sentences = sorted(rankings.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)

        # Slicing cannot run past the end of the list; max() keeps a negative
        # count from being read as "all but the last k".
        selected = ranked_sentences[:max(n_sentences, 0)]
        return "".join(sentence for sentence, _score in selected)
Example #3
0
 def __init__(self, directory, cooccuranceThrehsold=1):
     """
     Remember the corpus, build its TextGraph model, and cache the keyword
     graph and document-frequency table obtained from that model.

     Arguments:
         directory : corpus location, passed straight to TextGraph.
         cooccuranceThrehsold : forwarded to TextGraph.keywordGraph.
             Default value is 1
     """
     self.corpus = directory
     text_model = TextGraph(directory)
     self.model = text_model
     keyword_graph = text_model.keywordGraph(cooccuranceThrehsold)
     self.graph = keyword_graph
     self.docFreqDict = text_model.docFreq()
Example #4
0
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.
    KeyGraph is a slight modification of the keywordGraph as defined in the paper ...
    KeyGraph is used in extracting topics for the corpus of the documents.

    Arguments:
        directory : a corpus of text files.
        cooccuranceThrehsold : If the co-occurrences of two keywords is above
            cooccuranceThrehsold, there is an edge between the nodes represented
            by the keywords.
            Default value is 1
    """

    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the probability that keyword2 occurs in a document given
        that keyword1 occurs in that document.

        Arguments:
            keyword1 : the keyword being conditioned on.
            keyword2 : the keyword whose co-occurrence is measured.

        Returns:
            Fraction (a float in [0, 1]) of the documents containing keyword1
            that also contain keyword2; 0.0 when keyword1 has no documents.

        Raises:
            KeyError if either keyword is missing from the document
            frequency table.
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])

        totalDocs = len(docsContainingKeyword1)
        if totalDocs == 0:
            # Nothing to condition on; avoid a ZeroDivisionError.
            return 0.0

        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() forces true division; int / int truncates on Python 2.
        return float(commonDocs) / totalDocs

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning the keyword graph in place.

        Arguments:
            docFreqThreshold : If the document frequency of a keyword is below
                the docFreqThreshold, then the node corresponding
                to the keyword is removed from the graph.
                Default value is 2.
            probThreshold : An edge is removed when the co-occurrence
                probability of its endpoints, taken in either direction,
                is below probThreshold.
                Default value is 0.01.

        Returns:
            The pruned graph (the same object as self.graph).
        """
        g = self.graph
        documentFrequency = self.docFreqDict

        # Iterate over a snapshot: removing nodes while walking a live node
        # view fails, and list() is harmless when nodes() is already a list.
        for x in list(g.nodes()):
            if len(documentFrequency[x]) < docFreqThreshold:
                g.remove_node(x)

        # Read the edges only after node pruning so that edges which vanished
        # together with a removed node are never passed to remove_edge.
        for e in list(g.edges()):
            node1 = e[0]
            node2 = e[1]
            if (self.probCoocurance(node1, node2) < probThreshold
                    or self.probCoocurance(node2, node1) < probThreshold):
                g.remove_edge(node1, node2)
        return g
Example #5
0
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank


# Corpus directory handed to TextGraph and LexRank below.
directory = 'Data'
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps the indexing valid whether nodes() returns a list or a view.
node = list(senGraph.nodes())[0]

# Plot the sentence graph.
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Connected component that contains the chosen node.
out1 = Analysis(senGraph)
component = out1.nodeInConnectedComponent(node)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(component.nodes())

cS = out1.centralNodes(4)
print(cS)

# Summarise the corpus with the default compression.
lR = LexRank(directory)
print(lR.summary())
 def __init__(self, directory, cooccuranceThrehsold=1):
     """
     Remember the corpus, build its TextGraph model, and cache the keyword
     graph and document-frequency table obtained from that model.

     Arguments:
         directory : corpus location, passed straight to TextGraph.
         cooccuranceThrehsold : forwarded to TextGraph.keywordGraph.
             Default value is 1
     """
     self.corpus = directory
     text_model = TextGraph(directory)
     self.model = text_model
     keyword_graph = text_model.keywordGraph(cooccuranceThrehsold)
     self.graph = keyword_graph
     self.docFreqDict = text_model.docFreq()
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.
    KeyGraph is a slight modification of the keywordGraph as defined in the paper ...
    KeyGraph is used in extracting topics for the corpus of the documents.

    Arguments:
        directory : a corpus of text files.
        cooccuranceThrehsold : If the co-occurrences of two keywords is above
            cooccuranceThrehsold, there is an edge between the nodes represented
            by the keywords.
            Default value is 1
    """
    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the probability that keyword2 occurs in a document given
        that keyword1 occurs in that document.

        Arguments:
            keyword1 : the keyword being conditioned on.
            keyword2 : the keyword whose co-occurrence is measured.

        Returns:
            Fraction (a float in [0, 1]) of the documents containing keyword1
            that also contain keyword2; 0.0 when keyword1 has no documents.

        Raises:
            KeyError if either keyword is missing from the document
            frequency table.
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])

        totalDocs = len(docsContainingKeyword1)
        if totalDocs == 0:
            # Nothing to condition on; avoid a ZeroDivisionError.
            return 0.0

        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() forces true division; int / int truncates on Python 2.
        return float(commonDocs) / totalDocs

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning the keyword graph in place.

        Arguments:
            docFreqThreshold : If the document frequency of a keyword is below
                the docFreqThreshold, then the node corresponding
                to the keyword is removed from the graph.
                Default value is 2.
            probThreshold : An edge is removed when the co-occurrence
                probability of its endpoints, taken in either direction,
                is below probThreshold.
                Default value is 0.01.

        Returns:
            The pruned graph (the same object as self.graph).
        """
        g = self.graph
        documentFrequency = self.docFreqDict

        # Iterate over a snapshot: removing nodes while walking a live node
        # view fails, and list() is harmless when nodes() is already a list.
        for x in list(g.nodes()):
            if len(documentFrequency[x]) < docFreqThreshold:
                g.remove_node(x)

        # Read the edges only after node pruning so that edges which vanished
        # together with a removed node are never passed to remove_edge.
        for e in list(g.edges()):
            node1 = e[0]
            node2 = e[1]
            if (self.probCoocurance(node1, node2) < probThreshold
                    or self.probCoocurance(node2, node1) < probThreshold):
                g.remove_edge(node1, node2)
        return g
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank

##  Import the corpus of text files.
directory = 'Data'

##  Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps the indexing valid whether nodes() returns a list or a view.
node = list(senGraph.nodes())[0]

##  Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph
kwgraph = g.keywordGraph()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(kwgraph.nodes()))

##  Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)

#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node.
component = out1.nodeInConnectedComponent(node)
print(component.nodes())
Example #9
0
 def __init__(self, directory):
     """
     Build a TextGraph for the corpus and keep it on self.graph.

     Arguments:
         directory - corpus location, passed straight to TextGraph.
     """
     text_graph = TextGraph(directory)
     self.graph = text_graph
Example #10
0
	def __init__(self, directory):
		"""
		Build a TextGraph for the corpus and keep it on self.graph.

		Arguments:
			directory - corpus location, passed straight to TextGraph.
		"""
		text_graph = TextGraph(directory)
		self.graph = text_graph
Example #11
0
from TextGraphics.src.graph import TextGraph
from TextGraphics.Analysis.plotting import Pictures
from TextGraphics.Analysis.properties import Analysis
from TextGraphics.Applications.summary import LexRank


##  Import the corpus of text files.
directory = "Data"

##  Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps the indexing valid whether nodes() returns a list or a view.
node = list(senGraph.nodes())[0]

##  Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph
kwgraph = g.keywordGraph()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(kwgraph.nodes()))

##  Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)


#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node.