class ImportMallet(object):
    """
    Copies mallet file formats into Termite internal format.

    Reads the topic-word weights file produced by Mallet, converts it into a
    dense term-by-topic matrix, and stores the result on a ``ModelAPI``
    instance, which is then written out.
    """

    # Files generated by Mallet
    TOPIC_WORD_WEIGHTS = 'topic-word-weights.txt'

    def __init__(self, logging_level):
        """
        Configure the 'ImportMallet' logger to report on stderr.

        Args:
            logging_level: Level applied to both the logger and its stderr
                handler (e.g. ``logging.INFO``).
        """
        self.logger = logging.getLogger('ImportMallet')
        self.logger.setLevel(logging_level)
        # logging.getLogger() hands back the same logger object for a given
        # name, so unconditionally adding a handler here would make every
        # message print once per instance ever constructed. Reuse an
        # already-attached stderr handler and only refresh its level.
        for existing in self.logger.handlers:
            if (isinstance(existing, logging.StreamHandler)
                    and getattr(existing, 'stream', None) is sys.stderr):
                existing.setLevel(logging_level)
                break
        else:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, model_library, model_path, data_path):
        """
        Import a Mallet model from ``model_path`` and write it to ``data_path``.

        Args:
            model_library: Must be the string 'mallet'.
            model_path: Folder containing the Mallet output files.
            data_path: Passed to ``ModelAPI``; presumably the Termite output
                location -- confirm against ModelAPI.

        Raises:
            AssertionError: If any argument is None or ``model_library`` is
                not 'mallet'.
        """
        assert model_library is not None
        assert model_library == 'mallet'
        assert model_path is not None
        assert data_path is not None

        separator = '--------------------------------------------------------------------------------'
        self.logger.info(separator)
        self.logger.info('Importing a Mallet model...')
        self.logger.info(' topic model = %s (%s)', model_path, model_library)
        self.logger.info(' output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS)
        self.extractTopicWordWeights(model_path)

        self.logger.info('Writing data to disk...')
        self.model.write()
        self.logger.info(separator)

    def extractTopicWordWeights(self, model_path):
        """
        Load the topic-word weights file into ``self.model``.

        Each record of the file is unpacked as ``(topic, word, value)``; the
        topic is parsed as an int and the value as a float. Sets
        ``term_topic_matrix`` (one row per term, one column per topic, both in
        sorted order), ``term_index`` and ``topic_index`` on ``self.model``.

        Args:
            model_path: Folder containing the topic-word weights file.
        """
        weights = {}        # topic id -> {word: weight}
        vocabulary = set()

        # Read in content of file (sparse matrix representation)
        filename = '{}/{}'.format(model_path, ImportMallet.TOPIC_WORD_WEIGHTS)
        with open(filename, 'r') as f:
            # UnicodeReader is a project helper; its default delimiter is not
            # visible here -- presumably it matches Mallet's output format.
            for (topic, word, value) in UnicodeReader(f):
                topic = int(topic)
                weights.setdefault(topic, {})[word] = float(value)
                vocabulary.add(word)

        # Get list of terms and topic indexes
        term_index = sorted(vocabulary)
        topic_index = sorted(weights)

        # Build dense matrix representation. The input is sparse, so a
        # (topic, term) pair may be absent; treat a missing weight as 0.0
        # instead of failing with a KeyError.
        matrix = [
            [weights[topic].get(term, 0.0) for topic in topic_index]
            for term in term_index
        ]

        # Generate topic labels
        topic_str_index = ['Topic {}'.format(d) for d in topic_index]

        self.model.term_topic_matrix = matrix
        self.model.term_index = term_index
        self.model.topic_index = topic_str_index
class ImportStmt(object):
    """
    Copies STMT file formats into Termite internal format.

    Reads the term, topic and document index files plus the topic-term and
    document-topic distribution CSV files, derives the term-topic and
    document-topic matrices, and stores everything on a ``ModelAPI`` instance
    which is then written out.
    """

    # Files generated by STMT
    TERM_INDEX = 'term-index.txt'
    TOPIC_INDEX = 'topic-index.txt'
    DOCUMENT_INDEX = 'doc-index.txt'
    TOPIC_TERM = 'topic-term-distributions.csv'
    DOCUMENT_TOPIC = 'document-topic-distributions.csv'

    def __init__(self, logging_level):
        """
        Configure the 'ImportStmt' logger to report on stderr.

        Args:
            logging_level: Level applied to both the logger and its stderr
                handler (e.g. ``logging.INFO``).
        """
        self.logger = logging.getLogger('ImportStmt')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns one shared logger per name; adding a
        # handler on every construction would duplicate each message. Reuse an
        # already-attached stderr handler and only refresh its level.
        for existing in self.logger.handlers:
            if (isinstance(existing, logging.StreamHandler)
                    and getattr(existing, 'stream', None) is sys.stderr):
                existing.setLevel(logging_level)
                break
        else:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, model_library, model_path, data_path):
        """
        Import an STMT model from ``model_path`` and write it to ``data_path``.

        Args:
            model_library: Must be the string 'stmt'.
            model_path: Folder containing the STMT output files.
            data_path: Passed to ``ModelAPI``; presumably the Termite output
                location -- confirm against ModelAPI.

        Raises:
            AssertionError: If any argument is None or ``model_library`` is
                not 'stmt'.
        """
        assert model_library is not None
        assert model_library == 'stmt'
        assert model_path is not None
        assert data_path is not None

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Importing an STMT model...')
        self.logger.info(' topic model = %s (%s)', model_path, model_library)
        self.logger.info(' output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TERM_INDEX)
        self.model.term_index = self.readAsList(model_path, ImportStmt.TERM_INDEX)
        self.model.term_count = len(self.model.term_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX)
        self.model.topic_index = self.readAsList(model_path, ImportStmt.TOPIC_INDEX)
        self.model.topic_count = len(self.model.topic_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX)
        self.model.document_index = self.readAsList(model_path, ImportStmt.DOCUMENT_INDEX)
        self.model.document_count = len(self.model.document_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM)
        self.topic_term_counts = self.readCsvAsMatrixStr(model_path, ImportStmt.TOPIC_TERM)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC)
        self.document_topic_counts = self.readCsvAsMatrixStr(model_path, ImportStmt.DOCUMENT_TOPIC)

        # Both extractors depend on the counts and indexes loaded above.
        self.logger.info('Extracting term-topic matrix...')
        self.extractTermTopicMatrix()

        self.logger.info('Extracting document-topic matrix...')
        self.extractDocumentTopicMatrix()

        self.logger.info('Writing data to disk...')
        self.model.write()

    def readAsList(self, model_path, filename):
        """
        Return the lines of a UTF-8 text file as a list of unicode strings.

        Args:
            model_path: Folder containing the file.
            filename: Name of the file inside ``model_path``.

        Returns:
            One string per line, without line terminators.
        """
        # Imported locally so this class does not rely on a module-level import.
        import io

        filename = '{}/{}'.format(model_path, filename)
        # io.open decodes while reading and behaves the same on Python 2 and 3;
        # calling .decode() on the result of a text-mode read() only works on
        # Python 2.
        with io.open(filename, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    # Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file
    def readCsvAsMatrixStr(self, model_path, filename):
        """
        Return a matrix (list of list) of string values.
        Each row corresponds to a line of the input file.
        Each cell (in a row) corresponds to a comma-separated value (in each line).
        """
        filename = '{}/{}'.format(model_path, filename)
        with open(filename, 'r') as f:
            # Materialize inside the with-block: the reader pulls from f lazily.
            return list(UnicodeReader(f, delimiter=','))

    def extractDocumentTopicMatrix(self):
        """
        Extract document-topic matrix.
        Probability distributions are stored from the 2nd column onward in the document-topic distributions.

        Reads ``self.document_topic_counts`` and ``self.model.topic_count``;
        sets ``self.model.document_topic_matrix`` to a list of lists of floats.
        """
        last_column = self.model.topic_count + 1
        # Build real lists: on Python 3, map() yields a one-shot iterator,
        # which is not a usable matrix row.
        self.model.document_topic_matrix = [
            [float(value) for value in line[1:last_column]]
            for line in self.document_topic_counts
        ]

    def extractTermTopicMatrix(self):
        """
        Extract term-topic matrix.
        Transpose the input topic-term distributions.
        Ensure all values are greater than or equal to 0.

        Reads ``self.topic_term_counts`` and ``self.model.term_count``;
        sets ``self.model.term_topic_matrix`` with one row per term.
        """
        self.model.term_topic_matrix = [
            [max(0, float(topic_row[i])) for topic_row in self.topic_term_counts]
            for i in range(self.model.term_count)
        ]
class ImportStmt(object):
    """
    Copies STMT file formats into Termite internal format.

    Reads the term, topic and document index files plus the topic-term and
    document-topic distribution CSV files, derives the term-topic and
    document-topic matrices, and stores everything on a ``ModelAPI`` instance
    which is then written out.
    """

    # Files generated by STMT
    TERM_INDEX = 'term-index.txt'
    TOPIC_INDEX = 'topic-index.txt'
    DOCUMENT_INDEX = 'doc-index.txt'
    TOPIC_TERM = 'topic-term-distributions.csv'
    DOCUMENT_TOPIC = 'document-topic-distributions.csv'

    def __init__(self, logging_level):
        """
        Configure the 'ImportStmt' logger to report on stderr.

        Args:
            logging_level: Level applied to both the logger and its stderr
                handler (e.g. ``logging.INFO``).
        """
        self.logger = logging.getLogger('ImportStmt')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns one shared logger per name; adding a
        # handler on every construction would duplicate each message. Reuse an
        # already-attached stderr handler and only refresh its level.
        for existing in self.logger.handlers:
            if (isinstance(existing, logging.StreamHandler)
                    and getattr(existing, 'stream', None) is sys.stderr):
                existing.setLevel(logging_level)
                break
        else:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, model_library, model_path, data_path):
        """
        Import an STMT model from ``model_path`` and write it to ``data_path``.

        Args:
            model_library: Must be the string 'stmt'.
            model_path: Folder containing the STMT output files.
            data_path: Passed to ``ModelAPI``; presumably the Termite output
                location -- confirm against ModelAPI.

        Raises:
            AssertionError: If any argument is None or ``model_library`` is
                not 'stmt'.
        """
        assert model_library is not None
        assert model_library == 'stmt'
        assert model_path is not None
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Importing an STMT model...')
        self.logger.info(' topic model = %s (%s)', model_path, model_library)
        self.logger.info(' output = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TERM_INDEX)
        self.model.term_index = self.readAsList(model_path, ImportStmt.TERM_INDEX)
        self.model.term_count = len(self.model.term_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX)
        self.model.topic_index = self.readAsList(model_path, ImportStmt.TOPIC_INDEX)
        self.model.topic_count = len(self.model.topic_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX)
        self.model.document_index = self.readAsList(model_path, ImportStmt.DOCUMENT_INDEX)
        self.model.document_count = len(self.model.document_index)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM)
        self.topic_term_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.TOPIC_TERM)

        self.logger.info('Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC)
        self.document_topic_counts = self.readCsvAsMatrixStr(
            model_path, ImportStmt.DOCUMENT_TOPIC)

        # Both extractors depend on the counts and indexes loaded above.
        self.logger.info('Extracting term-topic matrix...')
        self.extractTermTopicMatrix()

        self.logger.info('Extracting document-topic matrix...')
        self.extractDocumentTopicMatrix()

        self.logger.info('Writing data to disk...')
        self.model.write()

    def readAsList(self, model_path, filename):
        """
        Return the lines of a UTF-8 text file as a list of unicode strings.

        Args:
            model_path: Folder containing the file.
            filename: Name of the file inside ``model_path``.

        Returns:
            One string per line, without line terminators.
        """
        # Imported locally so this class does not rely on a module-level import.
        import io

        filename = '{}/{}'.format(model_path, filename)
        # io.open decodes while reading and behaves the same on Python 2 and 3;
        # calling .decode() on the result of a text-mode read() only works on
        # Python 2.
        with io.open(filename, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    # Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file
    def readCsvAsMatrixStr(self, model_path, filename):
        """
        Return a matrix (list of list) of string values.
        Each row corresponds to a line of the input file.
        Each cell (in a row) corresponds to a comma-separated value (in each line).
        """
        filename = '{}/{}'.format(model_path, filename)
        with open(filename, 'r') as f:
            # Materialize inside the with-block: the reader pulls from f lazily.
            return list(UnicodeReader(f, delimiter=','))

    def extractDocumentTopicMatrix(self):
        """
        Extract document-topic matrix.
        Probability distributions are stored from the 2nd column onward in the document-topic distributions.

        Reads ``self.document_topic_counts`` and ``self.model.topic_count``;
        sets ``self.model.document_topic_matrix`` to a list of lists of floats.
        """
        last_column = self.model.topic_count + 1
        # Build real lists: on Python 3, map() yields a one-shot iterator,
        # which is not a usable matrix row.
        self.model.document_topic_matrix = [
            [float(value) for value in line[1:last_column]]
            for line in self.document_topic_counts
        ]

    def extractTermTopicMatrix(self):
        """
        Extract term-topic matrix.
        Transpose the input topic-term distributions.
        Ensure all values are greater than or equal to 0.

        Reads ``self.topic_term_counts``, ``self.model.term_count`` and
        ``self.model.topic_count``; sets ``self.model.term_topic_matrix`` with
        one row per term and one column per topic. Cells with no input value
        stay 0.

        Raises:
            IndexError: If the input has more rows than ``topic_count`` or a
                row has more values than ``term_count``.
        """
        # Each term needs its own row object. ``[[0] * n] * m`` would repeat a
        # reference to ONE row m times, so every write would show up in every
        # term's row.
        matrix = [
            [0] * self.model.topic_count
            for _ in range(self.model.term_count)
        ]
        for j, line in enumerate(self.topic_term_counts):
            for i, value in enumerate(line):
                matrix[i][j] = max(0, float(value))
        self.model.term_topic_matrix = matrix