Example 1
    def execute(self, data_path):

        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Preparing data for client...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info('Preparing parameters for seriated matrix...')
        self.prepareSeriatedParameters()

        self.logger.info('Preparing parameters for filtered matrix...')
        self.prepareFilteredParameters()

        self.logger.info('Preparing global term freqs...')
        self.prepareGlobalTermFreqs()

        self.logger.info('Writing data to disk...')
        self.client.write()
Example 2
    def execute(self, data_path):

        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Computing term saliency...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()

        self.logger.info('Computing...')
        self.computeTopicInfo()
        self.computeTermInfo()
        self.rankResults()

        self.logger.info('Writing data to disk...')
        self.saliency.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
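The execute methods above are entry points on small pipeline classes (shown in full further down this list). For context, a minimal driver for the saliency pass might look like the following; the logging level and data directory are assumptions for illustration, not part of the original snippets:

import logging

computer = ComputeSaliency(logging.INFO)
computer.execute('data/termite')   # 'data/termite' is a placeholder path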
Example 3
    def execute(self, model_library, model_path, data_path):

        assert model_library is not None
        assert model_library == 'stmt'
        assert model_path is not None
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Importing an STMT model...')
        self.logger.info('    topic model = %s (%s)', model_path,
                         model_library)
        self.logger.info('    output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TERM_INDEX)
        self.model.term_index = self.readAsList(model_path,
                                                ImportStmt.TERM_INDEX)
        self.model.term_count = len(self.model.term_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_INDEX)
        self.model.topic_index = self.readAsList(model_path,
                                                 ImportStmt.TOPIC_INDEX)
        self.model.topic_count = len(self.model.topic_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_INDEX)
        self.model.document_index = self.readAsList(model_path,
                                                    ImportStmt.DOCUMENT_INDEX)
        self.model.document_count = len(self.model.document_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_TERM)
        self.topic_term_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.TOPIC_TERM)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_TOPIC)
        self.document_topic_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.DOCUMENT_TOPIC)

        self.logger.info('Extracting term-topic matrix...')
        self.extractTermTopicMatrix()

        self.logger.info('Extracting document-topic matrix...')
        self.extractDocumentTopicMatrix()

        self.logger.info('Writing data to disk...')
        self.model.write()
Example 4
	def execute( self, data_path ):
		
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Preparing data for client...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		self.saliency = SaliencyAPI( data_path )
		self.seriation = SeriationAPI( data_path )
		self.client = ClientAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.model.read()
		self.saliency.read()
		self.seriation.read()
		
		self.logger.info( 'Merging term information...' )
		self.mergeTermInfo()
		
		self.logger.info( 'Extracting term-topic submatrix...' )
		self.extractTermTopicSubmatrix()
		
		self.logger.info( 'Writing data to disk...' )
		self.client.write()
Example 5
    def execute(self, data_path):

        assert data_path is not None

        self.logger.info("--------------------------------------------------------------------------------")
        self.logger.info("Preparing data for client...")
        self.logger.info("    data_path = %s", data_path)

        self.logger.info("Connecting to data...")
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info("Reading data from disk...")
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info("Preparing parameters for seriated matrix...")
        self.prepareSeriatedParameters()

        self.logger.info("Preparing parameters for filtered matrix...")
        self.prepareFilteredParameters()

        self.logger.info("Preparing global term freqs...")
        self.prepareGlobalTermFreqs()

        self.logger.info("Writing data to disk...")
        self.client.write()
Example 6
	def execute( self, model_library, model_path, data_path ):
		
		assert model_library is not None
		assert model_library == 'mallet'
		assert model_path is not None
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing a Mallet model...'                                                      )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library                             )
		self.logger.info( '    output = %s', data_path                                                       )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		
		self.logger.info( 'Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS )
		self.extractTopicWordWeights( model_path )
		
		self.logger.info( 'Writing data to disk...' )
		self.model.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
Example 7
	def execute( self, model_library, model_path, data_path ):
		
		assert model_library is not None
		assert model_library == 'stmt'
		assert model_path is not None
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing an STMT model...'                                                       )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library                             )
		self.logger.info( '    output = %s', data_path                                                       )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TERM_INDEX )
		self.model.term_index  = self.readAsList( model_path, ImportStmt.TERM_INDEX )
		self.model.term_count = len(self.model.term_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX )
		self.model.topic_index = self.readAsList( model_path, ImportStmt.TOPIC_INDEX )
		self.model.topic_count = len(self.model.topic_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX )
		self.model.document_index = self.readAsList( model_path, ImportStmt.DOCUMENT_INDEX )
		self.model.document_count = len(self.model.document_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM )
		self.topic_term_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.TOPIC_TERM )
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC )
		self.document_topic_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.DOCUMENT_TOPIC )
		
		self.logger.info( 'Extracting term-topic matrix...' )
		self.extractTermTopicMatrix()
		
		self.logger.info( 'Extracting document-topic matrix...' )
		self.extractDocumentTopicMatrix()
		
		self.logger.info( 'Writing data to disk...' )
		self.model.write()
Example 8
	def execute( self, data_path ):
		
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term saliency...'                                                       )
		self.logger.info( '    data_path = %s', data_path                                                    )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		self.saliency = SaliencyAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.model.read()
		
		self.logger.info( 'Computing...' )
		self.computeTopicInfo()
		self.computeTermInfo()
		self.rankResults()
		
		self.logger.info( 'Writing data to disk...' )
		self.saliency.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
Example 9
class ImportStmt( object ):
	
	"""
	Copies STMT file formats into Termite internal format.
	"""
	
	# Files generated by STMT
	TERM_INDEX = 'term-index.txt'
	TOPIC_INDEX = 'topic-index.txt'
	DOCUMENT_INDEX = 'doc-index.txt'
	TOPIC_TERM = 'topic-term-distributions.csv'
	DOCUMENT_TOPIC = 'document-topic-distributions.csv'
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ImportStmt' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, model_library, model_path, data_path ):
		
		assert model_library is not None
		assert model_library == 'stmt'
		assert model_path is not None
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing an STMT model...'                                                       )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library                             )
		self.logger.info( '    output = %s', data_path                                                       )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TERM_INDEX )
		self.model.term_index  = self.readAsList( model_path, ImportStmt.TERM_INDEX )
		self.model.term_count = len(self.model.term_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX )
		self.model.topic_index = self.readAsList( model_path, ImportStmt.TOPIC_INDEX )
		self.model.topic_count = len(self.model.topic_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX )
		self.model.document_index = self.readAsList( model_path, ImportStmt.DOCUMENT_INDEX )
		self.model.document_count = len(self.model.document_index)
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM )
		self.topic_term_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.TOPIC_TERM )
		
		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC )
		self.document_topic_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.DOCUMENT_TOPIC )
		
		self.logger.info( 'Extracting term-topic matrix...' )
		self.extractTermTopicMatrix()
		
		self.logger.info( 'Extracting document-topic matrix...' )
		self.extractDocumentTopicMatrix()
		
		self.logger.info( 'Writing data to disk...' )
		self.model.write()
	
	def readAsList( self, model_path, filename ):
		data = []
		filename = '{}/{}'.format( model_path, filename )
		with open( filename, 'r' ) as f:
			data = f.read().decode( 'utf-8' ).splitlines()
		return data
	
	# Needed for STMT, which generates a mixed string/float document-topic-distributions.csv file
	def readCsvAsMatrixStr( self, model_path, filename ):
		"""
		Return a matrix (list of list) of string values.
		Each row corresponds to a line of the input file.
		Each cell (in a row) corresponds to a comma-separated value (in each line).
		"""
		data = []
		filename = '{}/{}'.format( model_path, filename )
		with open( filename, 'r' ) as f:
			lines = UnicodeReader( f, delimiter = ',' )
			data = [ d for d in lines ]
		return data
	
	def extractDocumentTopicMatrix( self ):
		"""
		Extract document-topic matrix.
		Probability distributions are stored from the 2nd column onward in the document-topic distributions.
		"""
		matrix = []
		for line in self.document_topic_counts:
			matrix.append( map( float, line[1:self.model.topic_count+1] ) )
		self.model.document_topic_matrix = matrix
	
	def extractTermTopicMatrix( self ):
		"""
		Extract term-topic matrix.
		Transpose the input topic-term distributions.
		Ensure all values are greater than or equal to 0.
		"""
		matrix = []
		for i in range(self.model.term_count):
			matrix.append( [ max(0, float(x[i])) for x in self.topic_term_counts] )
		self.model.term_topic_matrix = matrix
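A note on readAsList above: the f.read().decode('utf-8') call assumes Python 2, where open() yields byte strings. A minimal Python 3-compatible equivalent (a sketch, not part of the original code) opens the file with an explicit encoding so no decode step is needed:

import io

def read_as_list(model_path, filename):
    # io.open with an encoding returns unicode text on both Python 2 and 3,
    # so the decode('utf-8') call becomes unnecessary.
    path = '{}/{}'.format(model_path, filename)
    with io.open(path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()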
Example 10
class ImportMallet( object ):

	"""
	Copies mallet file formats into Termite internal format.
	"""
	
	# Files generated by Mallet
	TOPIC_WORD_WEIGHTS = 'topic-word-weights.txt'
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ImportMallet' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, model_library, model_path, data_path ):
		
		assert model_library is not None
		assert model_library == 'mallet'
		assert model_path is not None
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing a Mallet model...'                                                      )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library                             )
		self.logger.info( '    output = %s', data_path                                                       )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		
		self.logger.info( 'Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS )
		self.extractTopicWordWeights( model_path )
		
		self.logger.info( 'Writing data to disk...' )
		self.model.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def extractTopicWordWeights( self, model_path ):
		data = {}
		words = []
		topics = []
		
		# Read in content of file (sparse matrix representation)
		filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
		with open( filename, 'r' ) as f:
			lines = UnicodeReader( f )
			for (topic, word, value) in lines:
				topic = int(topic)
				if topic not in data:
					data[ topic ] = {}
				data[ topic ][ word ] = float(value)
				words.append( word )
				topics.append( topic )
		
		# Get list of terms and topic indexes
		term_index = sorted( list( frozenset( words ) ) )
		topic_index = sorted( list( frozenset( topics ) ) )
		
		# Build dense matrix representation
		matrix = []
		for term in term_index :
			row = []
			for topic in topic_index :
				row.append( data[ topic ][ term ] )
			matrix.append( row )
		
		# Generate topic labels
		topic_str_index = [ 'Topic {}'.format(d) for d in topic_index ]
		
		self.model.term_topic_matrix = matrix
		self.model.term_index = term_index
		self.model.topic_index = topic_str_index
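One caveat on extractTopicWordWeights: the dense-matrix loop indexes data[ topic ][ term ] directly, which assumes every (topic, term) pair appears in topic-word-weights.txt. If the file were truly sparse, a missing pair would raise a KeyError. A defensive variant of that inner loop (an assumption about desired behavior, not the original code) substitutes a weight of 0.0 for absent pairs:

matrix = []
for term in term_index:
    # dict.get() supplies 0.0 for any (topic, term) pair absent from the
    # sparse input, where direct indexing would raise a KeyError.
    matrix.append( [ data[ topic ].get( term, 0.0 ) for topic in topic_index ] )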
Example 11
class ImportStmt(object):
    """
	Copies STMT file formats into Termite internal format.
	"""

    # Files generated by STMT
    TERM_INDEX = 'term-index.txt'
    TOPIC_INDEX = 'topic-index.txt'
    DOCUMENT_INDEX = 'doc-index.txt'
    TOPIC_TERM = 'topic-term-distributions.csv'
    DOCUMENT_TOPIC = 'document-topic-distributions.csv'

    def __init__(self, logging_level):
        self.logger = logging.getLogger('ImportStmt')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, model_library, model_path, data_path):

        assert model_library is not None
        assert model_library == 'stmt'
        assert model_path is not None
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Importing an STMT model...')
        self.logger.info('    topic model = %s (%s)', model_path,
                         model_library)
        self.logger.info('    output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TERM_INDEX)
        self.model.term_index = self.readAsList(model_path,
                                                ImportStmt.TERM_INDEX)
        self.model.term_count = len(self.model.term_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_INDEX)
        self.model.topic_index = self.readAsList(model_path,
                                                 ImportStmt.TOPIC_INDEX)
        self.model.topic_count = len(self.model.topic_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_INDEX)
        self.model.document_index = self.readAsList(model_path,
                                                    ImportStmt.DOCUMENT_INDEX)
        self.model.document_count = len(self.model.document_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_TERM)
        self.topic_term_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.TOPIC_TERM)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_TOPIC)
        self.document_topic_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.DOCUMENT_TOPIC)

        self.logger.info('Extracting term-topic matrix...')
        self.extractTermTopicMatrix()

        self.logger.info('Extracting document-topic matrix...')
        self.extractDocumentTopicMatrix()

        self.logger.info('Writing data to disk...')
        self.model.write()

    def readAsList(self, model_path, filename):
        data = []
        filename = '{}/{}'.format(model_path, filename)
        with open(filename, 'r') as f:
            data = f.read().decode('utf-8').splitlines()
        return data

    # Needed for STMT, which generates a mixed string/float document-topic-distributions.csv file
    def readCsvAsMatrixStr(self, model_path, filename):
        """
		Return a matrix (list of list) of string values.
		Each row corresponds to a line of the input file.
		Each cell (in a row) corresponds to a comma-separated value (in each line).
		"""
        data = []
        filename = '{}/{}'.format(model_path, filename)
        with open(filename, 'r') as f:
            lines = UnicodeReader(f, delimiter=',')
            data = [d for d in lines]
        return data

    def extractDocumentTopicMatrix(self):
        """
		Extract document-topic matrix.
		Probability distributions are stored from the 2nd column onward in the document-topic distributions.
		"""
        matrix = []
        for line in self.document_topic_counts:
            matrix.append(map(float, line[1:self.model.topic_count + 1]))
        self.model.document_topic_matrix = matrix

    def extractTermTopicMatrix(self):
        """
		Extract term-topic matrix.
		Transpose the input topic-term distributions.
		Ensure all values are greater than or equal to 0.
		"""
        matrix = [[0] * self.model.topic_count
                  for _ in range(self.model.term_count)]
        for j, line in enumerate(self.topic_term_counts):
            for i, value in enumerate(line):
                matrix[i][j] = max(0, float(value))
        self.model.term_topic_matrix = matrix
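The list comprehension in extractTermTopicMatrix above matters: initializing with matrix = [[0] * topic_count] * term_count would replicate a single row object term_count times, so writing through any one row would mutate all of them. A quick self-contained demonstration:

rows = [[0] * 3] * 2                  # both entries reference the same list object
rows[0][1] = 5
print(rows)                           # [[0, 5, 0], [0, 5, 0]] -- the write shows up in every row

safe = [[0] * 3 for _ in range(2)]    # the comprehension builds independent rows
safe[0][1] = 5
print(safe)                           # [[0, 5, 0], [0, 0, 0]]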
Example 12
class PrepareDataForClient(object):
    """
	Reformats data necessary for client to run. 
	
	Extracts a subset of the complete term list and term-topic matrix and writes
	the subset to a separate file. Also, generates JSON file that merges/packages term
	information with the actual term.
	
	Input is term-topic probability distribution and term information, stored in 4 files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
	    'term-info.txt' contains information about individual terms.
	
	Output is a subset of terms and matrix, as well as the term subset's information.
	Number of files created or copied: 5
		'submatrix-term-index.txt'
	    'submatrix-topic-index.txt'
	    'submatrix-term-topic.txt'
	    'term-info.json'
	    'term-info.txt'
	"""
    def __init__(self, logging_level):
        self.logger = logging.getLogger('PrepareDataForClient')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, data_path):

        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Preparing data for client...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info('Preparing parameters for seriated matrix...')
        self.prepareSeriatedParameters()

        self.logger.info('Preparing parameters for filtered matrix...')
        self.prepareFilteredParameters()

        self.logger.info('Preparing global term freqs...')
        self.prepareGlobalTermFreqs()

        self.logger.info('Writing data to disk...')
        self.client.write()

    def prepareSeriatedParameters(self):
        topic_index = self.model.topic_index
        term_index = self.model.term_index
        term_topic_matrix = self.model.term_topic_matrix
        term_ordering = self.seriation.term_ordering
        term_topic_submatrix = []
        term_subindex = []
        for term in term_ordering:
            if term in term_index:
                index = term_index.index(term)
                term_topic_submatrix.append(term_topic_matrix[index])
                term_subindex.append(term)
            else:
                self.logger.info(
                    'ERROR: Seriated term (%s) does not appear in the model term index',
                    term)

        self.client.seriated_parameters = {
            'termIndex': term_subindex,
            'topicIndex': topic_index,
            'matrix': term_topic_submatrix
        }

    def prepareFilteredParameters(self):
        term_rank_map = {
            term: value
            for value, term in enumerate(self.seriation.term_iter_index)
        }
        term_order_map = {
            term: value
            for value, term in enumerate(self.seriation.term_ordering)
        }
        term_saliency_map = {
            d['term']: d['saliency']
            for d in self.saliency.term_info
        }
        term_distinctiveness_map = {
            d['term']: d['distinctiveness']
            for d in self.saliency.term_info
        }

        self.client.filtered_parameters = {
            'termRankMap': term_rank_map,
            'termOrderMap': term_order_map,
            'termSaliencyMap': term_saliency_map,
            'termDistinctivenessMap': term_distinctiveness_map
        }

    def prepareGlobalTermFreqs(self):
        topic_index = self.model.topic_index
        term_index = self.model.term_index
        term_topic_matrix = self.model.term_topic_matrix
        term_ordering = self.seriation.term_ordering
        term_topic_submatrix = []
        term_subindex = []
        for term in term_ordering:
            if term in term_index:
                index = term_index.index(term)
                term_topic_submatrix.append(term_topic_matrix[index])
                term_subindex.append(term)
            else:
                self.logger.info(
                    'ERROR: Seriated term (%s) does not appear in the model term index',
                    term)

        term_freqs = {
            d['term']: d['frequency']
            for d in self.saliency.term_info
        }

        self.client.global_term_freqs = {
            'termIndex': term_subindex,
            'topicIndex': topic_index,
            'matrix': term_topic_submatrix,
            'termFreqMap': term_freqs
        }
Example 13
class ComputeSaliency(object):
    """
	Distinctiveness and saliency.
	
	Compute term distinctiveness and term saliency, based on
	the term probability distributions associated with a set of
	latent topics.
	
	Input is term-topic probability distribution, stored in 3 separate files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
	
	Output is a list of term distinctiveness and saliency values,
	in two duplicate formats, a tab-delimited file and a JSON object:
	    'term-info.txt'
	    'term-info.json'
	
	An auxiliary output is a list topic weights (i.e., the number of
	tokens in the corpus assigned to each latent topic) in two
	duplicate formats, a tab-delimited file and a JSON object:
	    'topic-info.txt'
	    'topic-info.json'
	"""
    def __init__(self, logging_level):
        self.logger = logging.getLogger('ComputeSaliency')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, data_path):

        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Computing term saliency...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()

        self.logger.info('Computing...')
        self.computeTopicInfo()
        self.computeTermInfo()
        self.rankResults()

        self.logger.info('Writing data to disk...')
        self.saliency.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )

    def computeTopicInfo(self):
        topic_weights = [sum(x) for x in zip(*self.model.term_topic_matrix)]
        topic_info = []
        for i in range(self.model.topic_count):
            topic_info.append({
                'topic': self.model.topic_index[i],
                'weight': topic_weights[i]
            })

        self.saliency.topic_info = topic_info

    def computeTermInfo(self):
        """Iterate over the list of terms. Compute frequency, distinctiveness, saliency."""

        topic_marginal = self.getNormalized(
            [d['weight'] for d in self.saliency.topic_info])
        term_info = []
        for i in range(self.model.term_count):
            term = self.model.term_index[i]
            counts = self.model.term_topic_matrix[i]
            frequency = sum(counts)
            probs = self.getNormalized(counts)
            distinctiveness = self.getKLDivergence(probs, topic_marginal)
            saliency = frequency * distinctiveness
            term_info.append({
                'term': term,
                'saliency': saliency,
                'frequency': frequency,
                'distinctiveness': distinctiveness,
                'rank': None,
                'visibility': 'default'
            })
        self.saliency.term_info = term_info

    def getNormalized(self, counts):
        """Rescale a list of counts, so they represent a proper probability distribution."""
        tally = sum(counts)
        if tally == 0:
            probs = [d for d in counts]
        else:
            probs = [d / tally for d in counts]
        return probs

    def getKLDivergence(self, P, Q):
        """Compute KL-divergence from P to Q"""
        divergence = 0
        assert len(P) == len(Q)
        for i in range(len(P)):
            p = P[i]
            q = Q[i]
            assert p >= 0
            assert q >= 0
            if p > 0:
                divergence += p * math.log(p / q)
        return divergence

    def rankResults(self):
        """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency."""
        self.saliency.topic_info = sorted(
            self.saliency.topic_info,
            key=lambda topic_weight: -topic_weight['weight'])
        self.saliency.term_info = sorted(
            self.saliency.term_info,
            key=lambda term_freq: -term_freq['saliency'])
        for i, element in enumerate(self.saliency.term_info):
            element['rank'] = i
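To make the saliency computation above concrete, here is a small worked example with two terms and two topics; the numbers are illustrative, not drawn from any real model:

import math

term_topic_matrix = [[9.0, 1.0],   # term A: concentrated in topic 0
                     [5.0, 5.0]]   # term B: spread evenly across topics

# Topic marginal: column sums, normalized (computeTopicInfo + getNormalized)
weights = [sum(col) for col in zip(*term_topic_matrix)]   # [14.0, 6.0]
total = sum(weights)
marginal = [w / total for w in weights]                   # [0.7, 0.3]

for counts in term_topic_matrix:
    frequency = sum(counts)
    probs = [c / frequency for c in counts]
    # Distinctiveness is the KL divergence from the term's own topic
    # distribution to the corpus-wide topic marginal.
    distinctiveness = sum(p * math.log(p / q)
                          for p, q in zip(probs, marginal) if p > 0)
    print(frequency * distinctiveness)   # saliency = frequency * distinctiveness

Term A, whose probability mass deviates more from the marginal, comes out at roughly 1.16 versus about 0.87 for term B, so it would rank higher after rankResults().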
Example 14
class PrepareDataForClient( object ):
	"""
	Reformats the data necessary for the client to run.
	
	Extracts a subset of the complete term list and term-topic matrix and writes
	the subset to a separate file. Also generates a JSON file that merges/packages
	term information with the actual terms.
	
	Input is term-topic probability distribution and term information, stored in 4 files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
	    'term-info.txt' contains information about individual terms.
	
	Output is a subset of terms and matrix, as well as the term subset's information.
	Number of files created or copied: 5
		'submatrix-term-index.txt'
	    'submatrix-topic-index.txt'
	    'submatrix-term-topic.txt'
	    'term-info.json'
	    'term-info.txt'
	"""
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'PrepareDataForClient' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path ):
		
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Preparing data for client...'                                                     )
		self.logger.info( '    data_path = %s', data_path                                                    )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		self.saliency = SaliencyAPI( data_path )
		self.seriation = SeriationAPI( data_path )
		self.client = ClientAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.model.read()
		self.saliency.read()
		self.seriation.read()
		
		self.logger.info( 'Merging term information...' )
		self.mergeTermInfo()
		
		self.logger.info( 'Extracting term-topic submatrix...' )
		self.extractTermTopicSubmatrix()
		
		self.logger.info( 'Writing data to disk...' )
		self.client.write()
	
	def mergeTermInfo( self ):
		# Build lookup tables
		term_orderings = { term: value for value, term in enumerate( self.seriation.term_ordering ) }
		term_iter_indexes = { term: value for value, term in enumerate( self.seriation.term_iter_index ) }
		term_freqs = { d['term']: d['frequency'] for d in self.saliency.term_info }
		term_saliencies = { d['term']: d['saliency'] for d in self.saliency.term_info }
		
		# Merge into a single object
		term_info = []
		for term in term_orderings:
			ordering = term_orderings[ term ]
			ranking = term_iter_indexes[ term ]
			frequency = term_freqs[ term ]
			saliency = term_saliencies[ term ]
			term_info.append( {
				"term" : term,
				"ranking" : ranking,
				"ordering" : ordering,
				"frequency" : frequency,
				"saliency" : saliency
			} )
			
		# Write to client API
		self.client.term_info = term_info
			
	def extractTermTopicSubmatrix( self ):
		topic_index = self.model.topic_index
		term_index = self.model.term_index
		term_topic_matrix = self.model.term_topic_matrix
		term_ordering = self.seriation.term_ordering
		
		term_topic_submatrix = []
		term_subindex = []
		for term in term_ordering:
			if term in term_index:
				index = term_index.index( term )
				term_topic_submatrix.append( term_topic_matrix[ index ] )
				term_subindex.append( term )
			else:
				self.logger.info( 'ERROR: Seriated term (%s) does not appear in the model term index', term )
		
		self.client.term_index = term_subindex
		self.client.topic_index = topic_index
		self.client.term_topic_matrix = term_topic_submatrix
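One note on mergeTermInfo above: it iterates over the term_orderings dict, so term_info comes out in arbitrary hash order rather than seriation order under Python 2. If downstream consumers care about order, iterating over the seriated list directly keeps the records deterministic; the following is a sketch of that variant (assuming seriation order is the intended one), reusing the lookup tables built earlier in the method:

term_info = []
for ordering, term in enumerate( self.seriation.term_ordering ):
    # enumerate() supplies the ordering position directly; the remaining
    # fields still come from the lookup tables built above.
    term_info.append( {
        "term" : term,
        "ranking" : term_iter_indexes[ term ],
        "ordering" : ordering,
        "frequency" : term_freqs[ term ],
        "saliency" : term_saliencies[ term ]
    } )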
Example 15
class ComputeSaliency( object ):
	"""
	Distinctiveness and saliency.
	
	Compute term distinctiveness and term saliency, based on
	the term probability distributions associated with a set of
	latent topics.
	
	Input is term-topic probability distribution, stored in 3 separate files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
	
	Output is a list of term distinctiveness and saliency values,
	in two duplicate formats, a tab-delimited file and a JSON object:
	    'term-info.txt'
	    'term-info.json'
	
	An auxiliary output is a list of topic weights (i.e., the number of
	tokens in the corpus assigned to each latent topic) in two
	duplicate formats, a tab-delimited file and a JSON object:
	    'topic-info.txt'
	    'topic-info.json'
	"""
	
	def __init__( self, logging_level ):
		self.logger = logging.getLogger( 'ComputeSaliency' )
		self.logger.setLevel( logging_level )
		handler = logging.StreamHandler( sys.stderr )
		handler.setLevel( logging_level )
		self.logger.addHandler( handler )
	
	def execute( self, data_path ):
		
		assert data_path is not None
		
		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Computing term saliency...'                                                       )
		self.logger.info( '    data_path = %s', data_path                                                    )
		
		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )
		self.saliency = SaliencyAPI( data_path )
		
		self.logger.info( 'Reading data from disk...' )
		self.model.read()
		
		self.logger.info( 'Computing...' )
		self.computeTopicInfo()
		self.computeTermInfo()
		self.rankResults()
		
		self.logger.info( 'Writing data to disk...' )
		self.saliency.write()
		
		self.logger.info( '--------------------------------------------------------------------------------' )
	
	def computeTopicInfo( self ):
		topic_weights = [ sum(x) for x in zip( *self.model.term_topic_matrix ) ]
		topic_info = []
		for i in range(self.model.topic_count):
			topic_info.append( {
				'topic' : self.model.topic_index[i],
				'weight' : topic_weights[i]
			} )
		
		self.saliency.topic_info = topic_info
	
	def computeTermInfo( self ):
		"""Iterate over the list of terms. Compute frequency, distinctiveness, saliency."""
		
		topic_marginal = self.getNormalized( [ d['weight'] for d in self.saliency.topic_info ] )
		term_info = []
		for i in range(self.model.term_count):
			term = self.model.term_index[i]
			counts = self.model.term_topic_matrix[i]
			frequency = sum( counts )
			probs = self.getNormalized( counts )
			distinctiveness = self.getKLDivergence( probs, topic_marginal )
			saliency = frequency * distinctiveness
			term_info.append( {
				'term' : term,
				'saliency' : saliency,
				'frequency' : frequency,
				'distinctiveness' : distinctiveness,
				'rank' : None,
				'visibility' : 'default'
			} )
		self.saliency.term_info = term_info
	
	def getNormalized( self, counts ):
		"""Rescale a list of counts, so they represent a proper probability distribution."""
		tally = sum( counts )
		if tally == 0:
			probs = [ d for d in counts ]
		else:
			probs = [ d / tally for d in counts ]
		return probs
	
	def getKLDivergence( self, P, Q ):
		"""Compute KL-divergence from P to Q"""
		divergence = 0
		assert len(P) == len(Q)
		for i in range(len(P)):
			p = P[i]
			q = Q[i]
			assert p >= 0
			assert q >= 0
			if p > 0:
				divergence += p * math.log( p / q )
		return divergence
	
	def rankResults( self ):
		"""Sort topics by decreasing weight. Sort term frequencies by decreasing saliency."""
		self.saliency.topic_info = sorted( self.saliency.topic_info, key = lambda topic_weight : -topic_weight['weight'] )
		self.saliency.term_info = sorted( self.saliency.term_info, key = lambda term_freq : -term_freq['saliency'] )
		for i, element in enumerate( self.saliency.term_info ):
			element['rank'] = i
Example 16
class PrepareDataForClient(object):
    """
	Reformats data necessary for client to run. 
	
	Extracts a subset of the complete term list and term-topic matrix and writes
	the subset to a separate file. Also, generates JSON file that merges/packages term
	information with the actual term.
	
	Input is term-topic probability distribution and term information, stored in 4 files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
	    'term-info.txt' contains information about individual terms.
	
	Output is a subset of terms and matrix, as well as the term subset's information.
	Number of files created or copied: 5
		'submatrix-term-index.txt'
	    'submatrix-topic-index.txt'
	    'submatrix-term-topic.txt'
	    'term-info.json'
	    'term-info.txt'
	"""

    def __init__(self, logging_level):
        self.logger = logging.getLogger("PrepareDataForClient")
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        self.logger.addHandler(handler)

    def execute(self, data_path):

        assert data_path is not None

        self.logger.info("--------------------------------------------------------------------------------")
        self.logger.info("Preparing data for client...")
        self.logger.info("    data_path = %s", data_path)

        self.logger.info("Connecting to data...")
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info("Reading data from disk...")
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info("Preparing parameters for seriated matrix...")
        self.prepareSeriatedParameters()

        self.logger.info("Preparing parameters for filtered matrix...")
        self.prepareFilteredParameters()

        self.logger.info("Preparing global term freqs...")
        self.prepareGlobalTermFreqs()

        self.logger.info("Writing data to disk...")
        self.client.write()

    def prepareSeriatedParameters(self):
        topic_index = self.model.topic_index
        term_index = self.model.term_index
        term_topic_matrix = self.model.term_topic_matrix
        term_ordering = self.seriation.term_ordering
        term_topic_submatrix = []
        term_subindex = []
        for term in term_ordering:
            if term in term_index:
                index = term_index.index(term)
                term_topic_submatrix.append(term_topic_matrix[index])
                term_subindex.append(term)
            else:
                self.logger.info("ERROR: Term (%s) does not appear in the list of seriated terms", term)

        self.client.seriated_parameters = {
            "termIndex": term_subindex,
            "topicIndex": topic_index,
            "matrix": term_topic_submatrix,
        }

    def prepareFilteredParameters(self):
        term_rank_map = {term: value for value, term in enumerate(self.seriation.term_iter_index)}
        term_order_map = {term: value for value, term in enumerate(self.seriation.term_ordering)}
        term_saliency_map = {d["term"]: d["saliency"] for d in self.saliency.term_info}
        term_distinctiveness_map = {d["term"]: d["distinctiveness"] for d in self.saliency.term_info}

        self.client.filtered_parameters = {
            "termRankMap": term_rank_map,
            "termOrderMap": term_order_map,
            "termSaliencyMap": term_saliency_map,
            "termDistinctivenessMap": term_distinctiveness_map,
        }

    def prepareGlobalTermFreqs(self):
        topic_index = self.model.topic_index
        term_index = self.model.term_index
        term_topic_matrix = self.model.term_topic_matrix
        term_ordering = self.seriation.term_ordering
        term_topic_submatrix = []
        term_subindex = []
        for term in term_ordering:
            if term in term_index:
                index = term_index.index(term)
                term_topic_submatrix.append(term_topic_matrix[index])
                term_subindex.append(term)
            else:
                self.logger.info("ERROR: Term (%s) does not appear in the list of seriated terms", term)

        term_freqs = {d["term"]: d["frequency"] for d in self.saliency.term_info}

        self.client.global_term_freqs = {
            "termIndex": term_subindex,
            "topicIndex": topic_index,
            "matrix": term_topic_submatrix,
            "termFreqMap": term_freqs,
        }
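A final performance note on prepareSeriatedParameters and prepareGlobalTermFreqs: both the membership test (term in term_index) and term_index.index(term) scan the whole list, making each loop quadratic in vocabulary size. Building a position table once reduces every lookup to constant time. The following is a sketch of a drop-in replacement for the loop body inside either method, not the original implementation:

position = {term: i for i, term in enumerate(term_index)}
term_topic_submatrix = []
term_subindex = []
for term in term_ordering:
    i = position.get(term)   # single O(1) dict lookup replaces the `in` test plus .index()
    if i is not None:
        term_topic_submatrix.append(term_topic_matrix[i])
        term_subindex.append(term)
    else:
        self.logger.info('ERROR: Seriated term (%s) does not appear in the model term index', term)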