def extractTopicWordWeights( self, model_path ):
    """
    Load the topic-word weights file and store it on self.model as a dense
    term-by-topic matrix.

    The file '<model_path>/<ImportMallet.TOPIC_WORD_WEIGHTS>' is read through
    UnicodeReader; every row is unpacked as (topic, word, value), with topic
    converted to int and value to float.

    Sets the following attributes on self.model:
      term_topic_matrix -- list of rows, one per term (terms in sorted order);
                           each row holds one float per topic (ascending topic number)
      term_index        -- sorted list of the distinct words seen in the file
      topic_index       -- labels of the form 'Topic <n>', in ascending topic order

    A (topic, word) pair that does not occur in the file is given weight 0.0
    in the dense matrix.
    """
    # Read in content of file (sparse matrix representation): topic -> { word -> weight }
    weights = {}
    terms = set()
    filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
    with open( filename, 'r' ) as f:
        for ( topic, word, value ) in UnicodeReader( f ):
            topic = int( topic )
            weights.setdefault( topic, {} )[ word ] = float( value )
            terms.add( word )

    # Get list of terms and topic indexes
    term_index = sorted( terms )
    topic_index = sorted( weights )

    # Build dense matrix representation; the input is sparse, so a missing
    # cell is filled with 0.0 instead of raising KeyError.
    matrix = [
        [ weights[ topic ].get( term, 0.0 ) for topic in topic_index ]
        for term in term_index
    ]

    # Generate topic labels
    topic_str_index = [ 'Topic {}'.format( d ) for d in topic_index ]

    self.model.term_topic_matrix = matrix
    self.model.term_index = term_index
    self.model.topic_index = topic_str_index
def read( self ):
    """
    Populate self.data from the tokens file.

    The file path is self.path concatenated directly with TokensAPI.TOKENS
    (no separator is inserted here). Each row produced by UnicodeReader is
    unpacked as (document id, token string); the token string is split on
    single spaces and stored in self.data under the document id. If a
    document id appears more than once, the last row wins.
    """
    self.data = {}
    tokens_path = self.path + TokensAPI.TOKENS
    with open( tokens_path, 'r' ) as handle:
        for doc_id, token_text in UnicodeReader( handle ):
            tokens = token_text.split( ' ' )
            self.data[ doc_id ] = tokens
def ReadAsSparseMatrix( filename ):
    """
    Read a three-column file into a sparse matrix.

    Each row produced by UnicodeReader is unpacked as (row key, column key,
    value). Returns a dict mapping the (row key, column key) tuple to the
    value converted to float. A later row with the same key pair overwrites
    an earlier one.
    """
    with open( filename, 'r' ) as handle:
        reader = UnicodeReader( handle )
        return { ( row_key, col_key ) : float( cell ) for row_key, col_key, cell in reader }
def ReadAsSparseVector( filename ):
    """
    Read a two-column file into a sparse vector.

    Each row produced by UnicodeReader is unpacked as (key, value). Returns a
    dict mapping each key to its value converted to float. A later row with
    the same key overwrites an earlier one.
    """
    with open( filename, 'r' ) as handle:
        reader = UnicodeReader( handle )
        return { name : float( cell ) for name, cell in reader }
def ReadAsMatrix( filename ):
    """
    Read a file into a dense matrix of floats.

    Returns a list with one entry per row produced by UnicodeReader; each
    entry is a list holding every field of that row converted to float.
    Raises ValueError if a field cannot be parsed as a float.
    """
    with open( filename, 'r' ) as f:
        # Build each row as a real list: on Python 3, map() returns a lazy
        # iterator, which would leave the matrix holding one-shot map objects
        # instead of indexable rows.
        return [ [ float( cell ) for cell in row ] for row in UnicodeReader( f ) ]
def readCsvAsMatrixStr(self, model_path, filename):
    """
    Load a comma-separated file as a matrix (list of lists) of strings.

    The file is opened at '<model_path>/<filename>' and parsed by
    UnicodeReader with ',' as the delimiter. The returned list has one
    entry per row yielded by the reader, in file order; each entry holds
    that row's comma-separated values.
    """
    csv_path = '{}/{}'.format(model_path, filename)
    with open(csv_path, 'r') as handle:
        return list(UnicodeReader(handle, delimiter=','))