Ejemplo n.º 1
0
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.
    KeyGraph is a slight modification of the keywordGraph as defined in the paper ...
    KeyGraph is used in extracting topics for the corpus of the documents.

    Arguments:
        directory : a corpus of text files.
        cooccuranceThrehsold : If the cooccurances of two keywords is above
            coocuranceThreshold, there is an edge between the nodes represented
            by the keywords.
            Default value is 1

    Attributes:
        corpus : the directory passed to the constructor.
        model : the TextGraph built from the directory.
        graph : the keyword graph returned by model.keywordGraph().
        docFreqDict : the mapping returned by model.docFreq(); it is indexed
            by keyword and each value is treated as a collection of documents
            (presumably the documents containing that keyword -- confirm
            against TextGraph.docFreq).
    """

    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the probability that keyword2 occurs in a document given
        that keyword1 occurs in it, i.e.
        |docs(keyword1) & docs(keyword2)| / |docs(keyword1)|.

        Arguments:
            keyword1 : the keyword that is conditioned on.
            keyword2 : the keyword whose co-occurrence is measured.

        Returns:
            A float in [0, 1]. 0.0 is returned when keyword1 has no
            documents, instead of dividing by zero.

        Raises:
            KeyError if either keyword is missing from docFreqDict.
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])

        if not docsContainingKeyword1:
            return 0.0

        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() forces true division; with two ints Python 2 would
        # truncate the ratio to 0 or 1.
        return float(commonDocs) / len(docsContainingKeyword1)

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning self.graph in place.

        Arguments:
            docFreqThreshold : If the document frequency of a keyword is below
                the docFreqThreshold, then the node corresponding to the
                keyword is removed from the graph.
                Default value is 2.
            probThreshold : An edge is removed when the conditional
                co-occurrence probability of its end points, taken in either
                direction, is below probThreshold.
                Default value is 0.01.

        Returns:
            The pruned graph. This is the same object as self.graph, so
            repeated calls keep pruning the already pruned graph.
        """
        g = self.graph
        documentFrequency = self.docFreqDict

        # Decide what to drop before mutating: nodes() may be a live view,
        # and removing nodes while iterating over it is an error.
        rareNodes = [
            x for x in list(g.nodes())
            if len(documentFrequency[x]) < docFreqThreshold
        ]
        for x in rareNodes:
            g.remove_node(x)

        # Take the edge snapshot only now, so edges that disappeared together
        # with a removed node are not passed to remove_edge a second time.
        for e in list(g.edges()):
            node1 = e[0]
            node2 = e[1]
            if (self.probCoocurance(node1, node2) < probThreshold) or (
                self.probCoocurance(node2, node1) < probThreshold
            ):
                g.remove_edge(node1, node2)
        return g
from TextGraphics.Applications.summary import LexRank

##  Import the corpus of text files.
directory = 'Data'

##  Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps positional indexing valid when nodes() returns a view
# rather than a plain list.
node = list(senGraph.nodes())[0]

##  Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph and report how many keyword nodes it has.
kwgraph = g.keywordGraph()
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(len(kwgraph.nodes()))

##  Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)

#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node and print its nodes.
l = out1.nodeInConnectedComponent(node)
print(l.nodes())

# Compute the 4 most central nodes (the result is kept in cS, not printed).
cS = out1.centralNodes(4)
class KeyGraph:
    """
    Implements the KeyGraph from the keywordGraph.
    KeyGraph is a slight modification of the keywordGraph as defined in the paper ...
    KeyGraph is used in extracting topics for the corpus of the documents.

    Arguments:
        directory : a corpus of text files.
        cooccuranceThrehsold : If the cooccurances of two keywords is above
            coocuranceThreshold, there is an edge between the nodes represented
            by the keywords.
            Default value is 1

    Attributes:
        corpus : the directory passed to the constructor.
        model : the TextGraph built from the directory.
        graph : the keyword graph returned by model.keywordGraph().
        docFreqDict : the mapping returned by model.docFreq(); it is indexed
            by keyword and each value is treated as a collection of documents
            (presumably the documents containing that keyword -- confirm
            against TextGraph.docFreq).
    """
    def __init__(self, directory, cooccuranceThrehsold=1):
        self.corpus = directory
        self.model = TextGraph(directory)
        self.graph = self.model.keywordGraph(cooccuranceThrehsold)
        self.docFreqDict = self.model.docFreq()

    def probCoocurance(self, keyword1, keyword2):
        """
        Compute the probability that keyword2 occurs in a document given
        that keyword1 occurs in it, i.e.
        |docs(keyword1) & docs(keyword2)| / |docs(keyword1)|.

        Arguments:
            keyword1 : the keyword that is conditioned on.
            keyword2 : the keyword whose co-occurrence is measured.

        Returns:
            A float in [0, 1]. 0.0 is returned when keyword1 has no
            documents, instead of dividing by zero.

        Raises:
            KeyError if either keyword is missing from docFreqDict.
        """
        documentFrequencyDict = self.docFreqDict
        docsContainingKeyword1 = set(documentFrequencyDict[keyword1])
        docsContainingKeyword2 = set(documentFrequencyDict[keyword2])

        if not docsContainingKeyword1:
            return 0.0

        commonDocs = len(docsContainingKeyword1 & docsContainingKeyword2)
        # float() forces true division; with two ints Python 2 would
        # truncate the ratio to 0 or 1.
        return float(commonDocs) / len(docsContainingKeyword1)

    def network(self, docFreqThreshold=2, probThreshold=0.01):
        """
        Compute the KeyGraph by pruning self.graph in place.

        Arguments:
            docFreqThreshold : If the document frequency of a keyword is below
                the docFreqThreshold, then the node corresponding to the
                keyword is removed from the graph.
                Default value is 2.
            probThreshold : An edge is removed when the conditional
                co-occurrence probability of its end points, taken in either
                direction, is below probThreshold.
                Default value is 0.01.

        Returns:
            The pruned graph. This is the same object as self.graph, so
            repeated calls keep pruning the already pruned graph.
        """
        g = self.graph
        documentFrequency = self.docFreqDict

        # Decide what to drop before mutating: nodes() may be a live view,
        # and removing nodes while iterating over it is an error.
        rareNodes = [
            x for x in list(g.nodes())
            if len(documentFrequency[x]) < docFreqThreshold
        ]
        for x in rareNodes:
            g.remove_node(x)

        # Take the edge snapshot only now, so edges that disappeared together
        # with a removed node are not passed to remove_edge a second time.
        for e in list(g.edges()):
            node1 = e[0]
            node2 = e[1]
            if (self.probCoocurance(node1, node2) < probThreshold) or (
                    self.probCoocurance(node2, node1) < probThreshold):
                g.remove_edge(node1, node2)
        return g
Ejemplo n.º 4
0

##  Import the corpus of text files.
directory = "Data"

##  Create the sentence graph
g = TextGraph(directory)
senGraph = g.sentenceGraph()
# list() keeps positional indexing valid when nodes() returns a view
# rather than a plain list.
node = list(senGraph.nodes())[0]

##  Plot the sentence graph
out = Pictures(senGraph)
out.graphPlot(0.3, labelingByNumbers=True)

# Compute the keyword graph and report how many keyword nodes it has.
kwgraph = g.keywordGraph()
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(len(kwgraph.nodes()))

##  Plot the keyword graph
out = Pictures(kwgraph)
out.graphPlot(0.3, labelingByNumbers=True)


#### Analysis
out1 = Analysis(senGraph)

# Find the connected component of a node and print its nodes.
l = out1.nodeInConnectedComponent(node)
print(l.nodes())

# Print 4 most central nodes.