Esempio n. 1
0
class ImportMallet( object ):

	"""
	Copies mallet file formats into Termite internal format.

	The Mallet topic-word weights file is read from the model folder and
	stored on a ModelAPI object as a sorted term index, a list of topic
	labels and a dense term-topic matrix, which is then written out.
	"""

	# Files generated by Mallet
	TOPIC_WORD_WEIGHTS = 'topic-word-weights.txt'

	def __init__( self, logging_level ):
		"""
		Set up the 'ImportMallet' logger so that it reports to stderr.

		logging_level -- level applied to both the logger and its handler.
		"""
		self.logger = logging.getLogger( 'ImportMallet' )
		self.logger.setLevel( logging_level )
		if self.logger.handlers:
			# logging.getLogger() hands back the same logger object for a given
			# name, so an earlier instance has already attached a handler.
			# Attaching another one would print every message once per instance;
			# reuse what is there and only bring its level up to date.
			for handler in self.logger.handlers:
				handler.setLevel( logging_level )
		else:
			handler = logging.StreamHandler( sys.stderr )
			handler.setLevel( logging_level )
			self.logger.addHandler( handler )

	def execute( self, model_library, model_path, data_path ):
		"""
		Import a Mallet model found in model_path and write it to data_path.

		model_library -- must be the string 'mallet'.
		model_path    -- folder containing the Mallet output files.
		data_path     -- location handed to ModelAPI for the imported data.

		The ModelAPI object is kept on self.model.
		Raises AssertionError if an argument is missing or the library is not 'mallet'.
		"""
		assert model_library is not None
		assert model_library == 'mallet'
		assert model_path is not None
		assert data_path is not None

		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing a Mallet model...' )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library )
		self.logger.info( '    output = %s', data_path )

		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )

		self.logger.info( 'Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS )
		self.extractTopicWordWeights( model_path )

		self.logger.info( 'Writing data to disk...' )
		self.model.write()

		self.logger.info( '--------------------------------------------------------------------------------' )

	@staticmethod
	def _toDenseMatrix( data, term_index, topic_index ):
		"""
		Expand nested {topic: {word: weight}} data into a list of rows.

		There is one row per entry of term_index and one column per entry of
		topic_index. A (topic, word) pair that is absent from the sparse
		input is given a weight of 0.0 instead of raising KeyError.
		"""
		return [ [ data[ topic ].get( term, 0.0 ) for topic in topic_index ] for term in term_index ]

	def extractTopicWordWeights( self, model_path ):
		"""
		Read the topic-word weights file and store the result on self.model.

		Every record produced by UnicodeReader is unpacked as
		(topic, word, value); topic is converted with int() and value with float().
		Sets self.model.term_topic_matrix, self.model.term_index (sorted words)
		and self.model.topic_index ('Topic <n>' labels in ascending topic order).
		"""
		data = {}
		words = set()

		# Read in content of file (sparse matrix representation)
		filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
		with open( filename, 'r' ) as f:
			for (topic, word, value) in UnicodeReader( f ):
				data.setdefault( int(topic), {} )[ word ] = float(value)
				words.add( word )

		# Get list of terms and topic indexes
		term_index = sorted( words )
		topic_index = sorted( data )

		# Generate topic labels
		topic_str_index = [ 'Topic {}'.format(d) for d in topic_index ]

		self.model.term_topic_matrix = ImportMallet._toDenseMatrix( data, term_index, topic_index )
		self.model.term_index = term_index
		self.model.topic_index = topic_str_index
Esempio n. 2
0
class ImportStmt( object ):

	"""
	Copies STMT file formats into Termite internal format.

	The index files and distribution CSV files listed below are read from
	the model folder, converted into a term-topic matrix and a
	document-topic matrix, and written out through a ModelAPI object.
	"""

	# Files generated by STMT
	TERM_INDEX = 'term-index.txt'
	TOPIC_INDEX = 'topic-index.txt'
	DOCUMENT_INDEX = 'doc-index.txt'
	TOPIC_TERM = 'topic-term-distributions.csv'
	DOCUMENT_TOPIC = 'document-topic-distributions.csv'

	def __init__( self, logging_level ):
		"""
		Set up the 'ImportStmt' logger so that it reports to stderr.

		logging_level -- level applied to both the logger and its handler.
		"""
		self.logger = logging.getLogger( 'ImportStmt' )
		self.logger.setLevel( logging_level )
		if self.logger.handlers:
			# logging.getLogger() hands back the same logger object for a given
			# name, so an earlier instance has already attached a handler.
			# Attaching another one would print every message once per instance;
			# reuse what is there and only bring its level up to date.
			for handler in self.logger.handlers:
				handler.setLevel( logging_level )
		else:
			handler = logging.StreamHandler( sys.stderr )
			handler.setLevel( logging_level )
			self.logger.addHandler( handler )

	def execute( self, model_library, model_path, data_path ):
		"""
		Import an STMT model found in model_path and write it to data_path.

		model_library -- must be the string 'stmt'.
		model_path    -- folder containing the STMT output files.
		data_path     -- location handed to ModelAPI for the imported data.

		The ModelAPI object is kept on self.model; the raw CSV contents are
		kept on self.topic_term_counts and self.document_topic_counts.
		Raises AssertionError if an argument is missing or the library is not 'stmt'.
		"""
		assert model_library is not None
		assert model_library == 'stmt'
		assert model_path is not None
		assert data_path is not None

		self.logger.info( '--------------------------------------------------------------------------------' )
		self.logger.info( 'Importing an STMT model...' )
		self.logger.info( '    topic model = %s (%s)', model_path, model_library )
		self.logger.info( '    output = %s', data_path )

		self.logger.info( 'Connecting to data...' )
		self.model = ModelAPI( data_path )

		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TERM_INDEX )
		self.model.term_index = self.readAsList( model_path, ImportStmt.TERM_INDEX )
		self.model.term_count = len(self.model.term_index)

		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX )
		self.model.topic_index = self.readAsList( model_path, ImportStmt.TOPIC_INDEX )
		self.model.topic_count = len(self.model.topic_index)

		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX )
		self.model.document_index = self.readAsList( model_path, ImportStmt.DOCUMENT_INDEX )
		self.model.document_count = len(self.model.document_index)

		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM )
		self.topic_term_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.TOPIC_TERM )

		self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC )
		self.document_topic_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.DOCUMENT_TOPIC )

		self.logger.info( 'Extracting term-topic matrix...' )
		self.extractTermTopicMatrix()

		self.logger.info( 'Extracting document-topic matrix...' )
		self.extractDocumentTopicMatrix()

		self.logger.info( 'Writing data to disk...' )
		self.model.write()

	def readAsList( self, model_path, filename ):
		"""
		Return the lines of a UTF-8 text file as a list of unicode strings.
		Line terminators are not included in the returned strings.
		"""
		# io.open decodes while reading and behaves the same on Python 2 and 3;
		# calling .decode() on the result of a text-mode read() fails on Python 3.
		import io
		filename = '{}/{}'.format( model_path, filename )
		with io.open( filename, 'r', encoding = 'utf-8' ) as f:
			return f.read().splitlines()

	# Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file
	def readCsvAsMatrixStr( self, model_path, filename ):
		"""
		Return a matrix (list of list) of string values.
		Each row corresponds to a line of the input file.
		Each cell (in a row) corresponds to a comma-separated value (in each line).
		"""
		filename = '{}/{}'.format( model_path, filename )
		with open( filename, 'r' ) as f:
			# Consume the reader before the file is closed.
			return list( UnicodeReader( f, delimiter = ',' ) )

	def extractDocumentTopicMatrix( self ):
		"""
		Extract document-topic matrix.
		Probability distributions are stored from the 2nd column onward in the document-topic distributions.
		The result is stored on self.model.document_topic_matrix as a list of lists of floats.
		"""
		last_column = self.model.topic_count + 1
		# Build real lists: on Python 3, map() yields a one-shot iterator.
		self.model.document_topic_matrix = [
			[ float(value) for value in line[1:last_column] ]
			for line in self.document_topic_counts
		]

	def extractTermTopicMatrix( self ):
		"""
		Extract term-topic matrix.
		Transpose the input topic-term distributions.
		Ensure all values are greater than or equal to 0.
		The result is stored on self.model.term_topic_matrix with one row per term.
		"""
		self.model.term_topic_matrix = [
			[ max(0, float(x[i])) for x in self.topic_term_counts ]
			for i in range(self.model.term_count)
		]
Esempio n. 3
0
class ImportStmt(object):
    """
    Copies STMT file formats into Termite internal format.

    The index files and distribution CSV files listed below are read from
    the model folder, converted into a term-topic matrix and a
    document-topic matrix, and written out through a ModelAPI object.
    """

    # Files generated by STMT
    TERM_INDEX = 'term-index.txt'
    TOPIC_INDEX = 'topic-index.txt'
    DOCUMENT_INDEX = 'doc-index.txt'
    TOPIC_TERM = 'topic-term-distributions.csv'
    DOCUMENT_TOPIC = 'document-topic-distributions.csv'

    def __init__(self, logging_level):
        """
        Set up the 'ImportStmt' logger so that it reports to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger('ImportStmt')
        self.logger.setLevel(logging_level)
        if self.logger.handlers:
            # logging.getLogger() hands back the same logger object for a
            # given name, so an earlier instance has already attached a
            # handler. Attaching another one would print every message once
            # per instance; reuse what is there and only update its level.
            for handler in self.logger.handlers:
                handler.setLevel(logging_level)
        else:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, model_library, model_path, data_path):
        """
        Import an STMT model found in model_path and write it to data_path.

        model_library -- must be the string 'stmt'.
        model_path    -- folder containing the STMT output files.
        data_path     -- location handed to ModelAPI for the imported data.

        The ModelAPI object is kept on self.model; the raw CSV contents are
        kept on self.topic_term_counts and self.document_topic_counts.
        Raises AssertionError if an argument is missing or the library is
        not 'stmt'.
        """
        assert model_library is not None
        assert model_library == 'stmt'
        assert model_path is not None
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Importing an STMT model...')
        self.logger.info('    topic model = %s (%s)', model_path,
                         model_library)
        self.logger.info('    output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TERM_INDEX)
        self.model.term_index = self.readAsList(model_path,
                                                ImportStmt.TERM_INDEX)
        self.model.term_count = len(self.model.term_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_INDEX)
        self.model.topic_index = self.readAsList(model_path,
                                                 ImportStmt.TOPIC_INDEX)
        self.model.topic_count = len(self.model.topic_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_INDEX)
        self.model.document_index = self.readAsList(model_path,
                                                    ImportStmt.DOCUMENT_INDEX)
        self.model.document_count = len(self.model.document_index)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.TOPIC_TERM)
        self.topic_term_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.TOPIC_TERM)

        self.logger.info('Reading "%s" from STMT output...',
                         ImportStmt.DOCUMENT_TOPIC)
        self.document_topic_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.DOCUMENT_TOPIC)

        self.logger.info('Extracting term-topic matrix...')
        self.extractTermTopicMatrix()

        self.logger.info('Extracting document-topic matrix...')
        self.extractDocumentTopicMatrix()

        self.logger.info('Writing data to disk...')
        self.model.write()

    def readAsList(self, model_path, filename):
        """
        Return the lines of a UTF-8 text file as a list of unicode strings.
        Line terminators are not included in the returned strings.
        """
        # io.open decodes while reading and behaves the same on Python 2
        # and 3; calling .decode() on a text-mode read() fails on Python 3.
        import io
        filename = '{}/{}'.format(model_path, filename)
        with io.open(filename, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    # Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file
    def readCsvAsMatrixStr(self, model_path, filename):
        """
        Return a matrix (list of list) of string values.
        Each row corresponds to a line of the input file.
        Each cell (in a row) corresponds to a comma-separated value (in each line).
        """
        filename = '{}/{}'.format(model_path, filename)
        with open(filename, 'r') as f:
            # Consume the reader before the file is closed.
            return list(UnicodeReader(f, delimiter=','))

    def extractDocumentTopicMatrix(self):
        """
        Extract document-topic matrix.
        Probability distributions are stored from the 2nd column onward in
        the document-topic distributions.
        The result is stored on self.model.document_topic_matrix as a list
        of lists of floats.
        """
        last_column = self.model.topic_count + 1
        # Build real lists: on Python 3, map() yields a one-shot iterator.
        self.model.document_topic_matrix = [
            [float(value) for value in line[1:last_column]]
            for line in self.document_topic_counts
        ]

    def extractTermTopicMatrix(self):
        """
        Extract term-topic matrix.
        Transpose the input topic-term distributions.
        Ensure all values are greater than or equal to 0.
        The result is stored on self.model.term_topic_matrix with one row
        per term and one column per topic.
        """
        topic_count = self.model.topic_count
        term_count = self.model.term_count
        # Allocate every row separately. `[[0] * n] * m` repeats a single
        # row object m times, so a write to one term would show up in all.
        matrix = [[0] * topic_count for _ in range(term_count)]
        for j, line in enumerate(self.topic_term_counts):
            for i, value in enumerate(line):
                matrix[i][j] = max(0, float(value))
        self.model.term_topic_matrix = matrix