class PrepareDataForClient(object):
    """
    Reformats data necessary for client to run.

    Extracts a subset of the complete term list and term-topic matrix and
    writes the subset to a separate file. Also, generates JSON file that
    merges/packages term information with the actual term.

    Input is term-topic probability distribution and term information,
    stored in 4 files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
        'term-info.txt' contains information about individual terms.

    Output is a subset of terms and matrix, as well as the term subset's
    information. Number of files created or copied: 5
        'submatrix-term-index.txt'
        'submatrix-topic-index.txt'
        'submatrix-term-topic.txt'
        'term-info.json'
        'term-info.txt'

    NOTE(review): the file names above are carried over from the earlier
    documentation. This class itself only reads through ModelAPI,
    SaliencyAPI and SeriationAPI, fills in three attributes on ClientAPI
    (seriated_parameters, filtered_parameters, global_term_freqs) and calls
    ClientAPI.write(); which files result is decided there -- confirm.
    """

    def __init__(self, logging_level):
        """Configure the 'PrepareDataForClient' logger to write to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger('PrepareDataForClient')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns the same logger object on every call,
        # so a handler installed by an earlier instance must be dropped
        # first; otherwise every message is emitted once per instance.
        for stale in list(self.logger.handlers):
            if getattr(stale, 'installed_by', None) == 'PrepareDataForClient':
                self.logger.removeHandler(stale)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        handler.installed_by = 'PrepareDataForClient'
        self.logger.addHandler(handler)

    def execute(self, data_path):
        """Read model, saliency and seriation data, then write client data.

        data_path -- location handed unchanged to ModelAPI, SaliencyAPI,
                     SeriationAPI and ClientAPI; must not be None.
        """
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Preparing data for client...')
        self.logger.info(' data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info('Preparing parameters for seriated matrix...')
        self.prepareSeriatedParameters()

        self.logger.info('Preparing parameters for filtered matrix...')
        self.prepareFilteredParameters()

        self.logger.info('Preparing global term freqs...')
        self.prepareGlobalTermFreqs()

        self.logger.info('Writing data to disk...')
        self.client.write()

    def _extractSeriatedSubmatrix(self):
        """Return (terms, rows) of the model matrix in seriation order.

        Terms of the seriation ordering that are missing from the model
        term index are logged and skipped.
        """
        # Map each term to the FIRST row carrying it, which is what
        # list.index() returned, but with one pass instead of a linear
        # scan per seriated term.
        row_of_term = {}
        for row, term in enumerate(self.model.term_index):
            row_of_term.setdefault(term, row)

        term_topic_matrix = self.model.term_topic_matrix
        term_subindex = []
        term_topic_submatrix = []
        for term in self.seriation.term_ordering:
            row = row_of_term.get(term)
            if row is None:
                # The term comes from the seriation ordering; what it is
                # missing from is the model's term index.
                self.logger.info(
                    'ERROR: Term (%s) does not appear in the term index of the model',
                    term)
                continue
            term_subindex.append(term)
            term_topic_submatrix.append(term_topic_matrix[row])
        return term_subindex, term_topic_submatrix

    def prepareSeriatedParameters(self):
        """Store the seriated term list, topic list and submatrix on the client."""
        term_subindex, term_topic_submatrix = self._extractSeriatedSubmatrix()
        self.client.seriated_parameters = {
            'termIndex': term_subindex,
            'topicIndex': self.model.topic_index,
            'matrix': term_topic_submatrix
        }

    def prepareFilteredParameters(self):
        """Store per-term lookup maps (rank, order, saliency, distinctiveness).

        Rank and order are the positions of each term in
        seriation.term_iter_index and seriation.term_ordering respectively.
        """
        term_rank_map = {
            term: position
            for position, term in enumerate(self.seriation.term_iter_index)
        }
        term_order_map = {
            term: position
            for position, term in enumerate(self.seriation.term_ordering)
        }
        term_saliency_map = {
            info['term']: info['saliency']
            for info in self.saliency.term_info
        }
        term_distinctiveness_map = {
            info['term']: info['distinctiveness']
            for info in self.saliency.term_info
        }
        self.client.filtered_parameters = {
            'termRankMap': term_rank_map,
            'termOrderMap': term_order_map,
            'termSaliencyMap': term_saliency_map,
            'termDistinctivenessMap': term_distinctiveness_map
        }

    def prepareGlobalTermFreqs(self):
        """Store the seriated submatrix together with a term -> frequency map."""
        term_subindex, term_topic_submatrix = self._extractSeriatedSubmatrix()
        term_freqs = {
            info['term']: info['frequency']
            for info in self.saliency.term_info
        }
        self.client.global_term_freqs = {
            'termIndex': term_subindex,
            'topicIndex': self.model.topic_index,
            'matrix': term_topic_submatrix,
            'termFreqMap': term_freqs
        }
class ComputeSaliency(object):
    """
    Distinctiveness and saliency.

    Compute term distinctiveness and term saliency, based on
    the term probability distributions associated with a set of
    latent topics.

    Input is term-topic probability distribution, stored in 3 separate files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.

    Output is a list of term distinctiveness and saliency values,
    in two duplicate formats, a tab-delimited file and a JSON object:
        'term-info.txt'
        'term-info.json'

    An auxiliary output is a list of topic weights (i.e., the number of
    tokens in the corpus assigned to each latent topic) in two
    duplicate formats, a tab-delimited file and a JSON object:
        'topic-info.txt'
        'topic-info.json'

    Reading and writing are delegated to ModelAPI.read() and
    SaliencyAPI.write(); this class only fills in saliency.topic_info and
    saliency.term_info.
    """

    def __init__(self, logging_level):
        """Configure the 'ComputeSaliency' logger to write to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger('ComputeSaliency')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns the same logger object on every call,
        # so a handler installed by an earlier instance must be dropped
        # first; otherwise every message is emitted once per instance.
        for stale in list(self.logger.handlers):
            if getattr(stale, 'installed_by', None) == 'ComputeSaliency':
                self.logger.removeHandler(stale)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        handler.installed_by = 'ComputeSaliency'
        self.logger.addHandler(handler)

    def execute(self, data_path):
        """Read the model, compute topic/term statistics and write them out.

        data_path -- location handed unchanged to ModelAPI and SaliencyAPI;
                     must not be None.
        """
        assert data_path is not None

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )
        self.logger.info('Computing term saliency...')
        self.logger.info(' data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.model.read()

        self.logger.info('Computing...')
        self.computeTopicInfo()
        self.computeTermInfo()
        self.rankResults()

        self.logger.info('Writing data to disk...')
        self.saliency.write()

        self.logger.info(
            '--------------------------------------------------------------------------------'
        )

    def computeTopicInfo(self):
        """Set saliency.topic_info to one {'topic', 'weight'} dict per topic.

        A topic's weight is the sum of its column of the term-topic matrix.
        """
        matrix = self.model.term_topic_matrix
        topic_count = self.model.topic_count
        if matrix:
            topic_weights = [sum(column) for column in zip(*matrix)]
        else:
            # zip(*[]) yields no columns at all; with no terms every topic
            # simply has weight zero instead of raising IndexError below.
            topic_weights = [0] * topic_count
        self.saliency.topic_info = [
            {'topic': self.model.topic_index[i], 'weight': topic_weights[i]}
            for i in range(topic_count)
        ]

    def computeTermInfo(self):
        """Iterate over the list of terms. Compute frequency, distinctiveness, saliency.

        Requires saliency.topic_info (see computeTopicInfo). Distinctiveness
        is the KL-divergence of the term's normalized row from the normalized
        topic weights; saliency is frequency * distinctiveness.
        """
        topic_marginal = self.getNormalized(
            [topic['weight'] for topic in self.saliency.topic_info])
        term_info = []
        for i in range(self.model.term_count):
            counts = self.model.term_topic_matrix[i]
            frequency = sum(counts)
            distinctiveness = self.getKLDivergence(
                self.getNormalized(counts), topic_marginal)
            term_info.append({
                'term': self.model.term_index[i],
                'saliency': frequency * distinctiveness,
                'frequency': frequency,
                'distinctiveness': distinctiveness,
                'rank': None,  # filled in by rankResults()
                'visibility': 'default'
            })
        self.saliency.term_info = term_info

    def getNormalized(self, counts):
        """Rescale a list of counts, so they represent a proper probability distribution.

        Returns a new list. When the counts sum to zero they are returned
        unscaled (as a copy), since there is nothing to divide by.
        """
        tally = sum(counts)
        if tally == 0:
            return list(counts)
        # float() forces true division, so integer counts are not floored
        # to 0 when this runs under Python 2.
        tally = float(tally)
        return [count / tally for count in counts]

    def getKLDivergence(self, P, Q):
        """Compute KL-divergence from P to Q

        P, Q -- equal-length sequences of non-negative numbers. Entries with
        p == 0 contribute nothing. Raises ZeroDivisionError if some q is 0
        where the matching p is positive.
        """
        assert len(P) == len(Q)
        divergence = 0
        for p, q in zip(P, Q):
            assert p >= 0
            assert q >= 0
            if p > 0:
                divergence += p * math.log(p / q)
        return divergence

    def rankResults(self):
        """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency.

        Each term's 'rank' is then set to its 0-based position in the
        sorted list. Ties keep their original relative order.
        """
        self.saliency.topic_info = sorted(
            self.saliency.topic_info,
            key=lambda topic: topic['weight'],
            reverse=True)
        self.saliency.term_info = sorted(
            self.saliency.term_info,
            key=lambda term: term['saliency'],
            reverse=True)
        for rank, element in enumerate(self.saliency.term_info):
            element['rank'] = rank
class ComputeSaliency( object ):
    """
    Distinctiveness and saliency.

    Compute term distinctiveness and term saliency, based on
    the term probability distributions associated with a set of
    latent topics.

    Input is term-topic probability distribution, stored in 3 separate files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.

    Output is a list of term distinctiveness and saliency values,
    in two duplicate formats, a tab-delimited file and a JSON object:
        'term-info.txt'
        'term-info.json'

    An auxiliary output is a list of topic weights (i.e., the number of
    tokens in the corpus assigned to each latent topic) in two
    duplicate formats, a tab-delimited file and a JSON object:
        'topic-info.txt'
        'topic-info.json'

    Reading and writing are delegated to ModelAPI.read() and
    SaliencyAPI.write(); this class only fills in saliency.topic_info and
    saliency.term_info.
    """

    def __init__( self, logging_level ):
        """Configure the 'ComputeSaliency' logger to write to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger( 'ComputeSaliency' )
        self.logger.setLevel( logging_level )
        # getLogger() hands back one shared logger per name. Remove the
        # handler left behind by an earlier instance before adding a new
        # one, so messages are not printed once per instance created.
        for previous in list( self.logger.handlers ):
            if getattr( previous, 'installed_by', None ) == 'ComputeSaliency':
                self.logger.removeHandler( previous )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        handler.installed_by = 'ComputeSaliency'
        self.logger.addHandler( handler )

    def execute( self, data_path ):
        """Read the model, compute topic/term statistics and write them out.

        data_path -- location handed unchanged to ModelAPI and SaliencyAPI;
                     must not be None.
        """
        assert data_path is not None

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Computing term saliency...' )
        self.logger.info( ' data_path = %s', data_path )

        self.logger.info( 'Connecting to data...' )
        self.model = ModelAPI( data_path )
        self.saliency = SaliencyAPI( data_path )

        self.logger.info( 'Reading data from disk...' )
        self.model.read()

        self.logger.info( 'Computing...' )
        self.computeTopicInfo()
        self.computeTermInfo()
        self.rankResults()

        self.logger.info( 'Writing data to disk...' )
        self.saliency.write()

        self.logger.info( '--------------------------------------------------------------------------------' )

    def computeTopicInfo( self ):
        """Set saliency.topic_info to one { 'topic', 'weight' } dict per topic.

        A topic's weight is the sum of its column of the term-topic matrix.
        """
        matrix = self.model.term_topic_matrix
        topic_count = self.model.topic_count
        if matrix:
            topic_weights = [ sum( column ) for column in zip( *matrix ) ]
        else:
            # With no rows zip() produces no columns; report zero weight
            # per topic rather than failing with IndexError below.
            topic_weights = [ 0 ] * topic_count
        self.saliency.topic_info = [
            { 'topic' : self.model.topic_index[i], 'weight' : topic_weights[i] }
            for i in range( topic_count )
        ]

    def computeTermInfo( self ):
        """Iterate over the list of terms. Compute frequency, distinctiveness, saliency.

        Requires saliency.topic_info (see computeTopicInfo). Distinctiveness
        is the KL-divergence of the term's normalized row from the normalized
        topic weights; saliency is frequency * distinctiveness.
        """
        topic_marginal = self.getNormalized( [ topic['weight'] for topic in self.saliency.topic_info ] )
        term_info = []
        for i in range( self.model.term_count ):
            counts = self.model.term_topic_matrix[i]
            frequency = sum( counts )
            distinctiveness = self.getKLDivergence( self.getNormalized( counts ), topic_marginal )
            term_info.append( {
                'term' : self.model.term_index[i],
                'saliency' : frequency * distinctiveness,
                'frequency' : frequency,
                'distinctiveness' : distinctiveness,
                'rank' : None,  # filled in by rankResults()
                'visibility' : 'default'
            } )
        self.saliency.term_info = term_info

    def getNormalized( self, counts ):
        """Rescale a list of counts, so they represent a proper probability distribution.

        Returns a new list. When the counts sum to zero they are returned
        unscaled (as a copy), since there is nothing to divide by.
        """
        tally = sum( counts )
        if tally == 0:
            return list( counts )
        # Divide by a float so integer counts are not floor-divided to 0
        # when this runs under Python 2.
        tally = float( tally )
        return [ count / tally for count in counts ]

    def getKLDivergence( self, P, Q ):
        """Compute KL-divergence from P to Q

        P, Q -- equal-length sequences of non-negative numbers. Entries with
        p == 0 contribute nothing. Raises ZeroDivisionError if some q is 0
        where the matching p is positive.
        """
        assert len( P ) == len( Q )
        divergence = 0
        for p, q in zip( P, Q ):
            assert p >= 0
            assert q >= 0
            if p > 0:
                divergence += p * math.log( p / q )
        return divergence

    def rankResults( self ):
        """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency.

        Each term's 'rank' is then set to its 0-based position in the
        sorted list. Ties keep their original relative order.
        """
        self.saliency.topic_info = sorted(
            self.saliency.topic_info, key = lambda topic : topic['weight'], reverse = True )
        self.saliency.term_info = sorted(
            self.saliency.term_info, key = lambda term : term['saliency'], reverse = True )
        for rank, element in enumerate( self.saliency.term_info ):
            element['rank'] = rank
class PrepareDataForClient( object ):
    """
    Reformats data necessary for client to run.

    Extracts a subset of the complete term list and term-topic matrix and
    writes the subset to a separate file. Also, generates JSON file that
    merges/packages term information with the actual term.

    Input is term-topic probability distribution and term information,
    stored in 4 files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
        'term-info.txt' contains information about individual terms.

    Output is a subset of terms and matrix, as well as the term subset's
    information. Number of files created or copied: 5
        'submatrix-term-index.txt'
        'submatrix-topic-index.txt'
        'submatrix-term-topic.txt'
        'term-info.json'
        'term-info.txt'

    NOTE(review): the file names above are carried over from the earlier
    documentation. This class itself only reads through ModelAPI,
    SaliencyAPI and SeriationAPI, fills in term_info, term_index,
    topic_index and term_topic_matrix on ClientAPI and calls
    ClientAPI.write(); which files result is decided there -- confirm.
    """

    def __init__( self, logging_level ):
        """Configure the 'PrepareDataForClient' logger to write to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger( 'PrepareDataForClient' )
        self.logger.setLevel( logging_level )
        # getLogger() hands back one shared logger per name. Remove the
        # handler left behind by an earlier instance before adding a new
        # one, so messages are not printed once per instance created.
        for previous in list( self.logger.handlers ):
            if getattr( previous, 'installed_by', None ) == 'PrepareDataForClient':
                self.logger.removeHandler( previous )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        handler.installed_by = 'PrepareDataForClient'
        self.logger.addHandler( handler )

    def execute( self, data_path ):
        """Read model, saliency and seriation data, then write client data.

        data_path -- location handed unchanged to ModelAPI, SaliencyAPI,
                     SeriationAPI and ClientAPI; must not be None.
        """
        assert data_path is not None

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Preparing data for client...' )
        self.logger.info( ' data_path = %s', data_path )

        self.logger.info( 'Connecting to data...' )
        self.model = ModelAPI( data_path )
        self.saliency = SaliencyAPI( data_path )
        self.seriation = SeriationAPI( data_path )
        self.client = ClientAPI( data_path )

        self.logger.info( 'Reading data from disk...' )
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info( 'Merging term information...' )
        self.mergeTermInfo()

        self.logger.info( 'Extracting term-topic submatrix...' )
        self.extractTermTopicSubmatrix()

        self.logger.info( 'Writing data to disk...' )
        self.client.write()

    def mergeTermInfo( self ):
        """Combine ordering, ranking, frequency and saliency into one record per term.

        One record is produced for every distinct term of
        seriation.term_ordering and the list is stored in client.term_info.
        Raises KeyError if such a term is missing from
        seriation.term_iter_index or from saliency.term_info.
        """
        # Build lookup tables
        term_orderings = { term: position for position, term in enumerate( self.seriation.term_ordering ) }
        term_iter_indexes = { term: position for position, term in enumerate( self.seriation.term_iter_index ) }
        term_freqs = { info['term']: info['frequency'] for info in self.saliency.term_info }
        term_saliencies = { info['term']: info['saliency'] for info in self.saliency.term_info }

        # Merge into a single object
        term_info = [
            {
                "term" : term,
                "ranking" : term_iter_indexes[ term ],
                "ordering" : ordering,
                "frequency" : term_freqs[ term ],
                "saliency" : term_saliencies[ term ]
            }
            for term, ordering in term_orderings.items()
        ]

        # Write to client API
        self.client.term_info = term_info

    def extractTermTopicSubmatrix( self ):
        """Store the seriated terms and their matrix rows on the client.

        Sets client.term_index, client.topic_index and
        client.term_topic_matrix. Terms of the seriation ordering that are
        missing from the model term index are logged and skipped.
        """
        # Map each term to the FIRST row carrying it, which is what
        # list.index() returned, but with one pass instead of a linear
        # scan per seriated term.
        row_of_term = {}
        for row, term in enumerate( self.model.term_index ):
            row_of_term.setdefault( term, row )

        term_topic_matrix = self.model.term_topic_matrix
        term_subindex = []
        term_topic_submatrix = []
        for term in self.seriation.term_ordering:
            row = row_of_term.get( term )
            if row is None:
                # The term comes from the seriation ordering; what it is
                # missing from is the model's term index.
                self.logger.info( 'ERROR: Term (%s) does not appear in the term index of the model', term )
                continue
            term_subindex.append( term )
            term_topic_submatrix.append( term_topic_matrix[ row ] )

        self.client.term_index = term_subindex
        self.client.topic_index = self.model.topic_index
        self.client.term_topic_matrix = term_topic_submatrix
class PrepareDataForClient(object):
    """
    Reformats data necessary for client to run.

    Extracts a subset of the complete term list and term-topic matrix and
    writes the subset to a separate file. Also, generates JSON file that
    merges/packages term information with the actual term.

    Input is term-topic probability distribution and term information,
    stored in 4 files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
        'term-info.txt' contains information about individual terms.

    Output is a subset of terms and matrix, as well as the term subset's
    information. Number of files created or copied: 5
        'submatrix-term-index.txt'
        'submatrix-topic-index.txt'
        'submatrix-term-topic.txt'
        'term-info.json'
        'term-info.txt'

    NOTE(review): the file names above are carried over from the earlier
    documentation. This class itself only reads through ModelAPI,
    SaliencyAPI and SeriationAPI, fills in three attributes on ClientAPI
    (seriated_parameters, filtered_parameters, global_term_freqs) and calls
    ClientAPI.write(); which files result is decided there -- confirm.
    """

    def __init__(self, logging_level):
        """Configure the "PrepareDataForClient" logger to write to stderr.

        logging_level -- level applied to both the logger and its handler.
        """
        self.logger = logging.getLogger("PrepareDataForClient")
        self.logger.setLevel(logging_level)
        # The logger is shared by name across instances: replace, rather
        # than pile up, the stderr handler a previous instance attached,
        # so each message is written exactly once.
        for earlier in list(self.logger.handlers):
            if getattr(earlier, "installed_by", None) == "PrepareDataForClient":
                self.logger.removeHandler(earlier)
        handler = logging.StreamHandler(sys.stderr)
        handler.setLevel(logging_level)
        handler.installed_by = "PrepareDataForClient"
        self.logger.addHandler(handler)

    def execute(self, data_path):
        """Read model, saliency and seriation data, then write client data.

        data_path -- location handed unchanged to ModelAPI, SaliencyAPI,
                     SeriationAPI and ClientAPI; must not be None.
        """
        assert data_path is not None

        self.logger.info("--------------------------------------------------------------------------------")
        self.logger.info("Preparing data for client...")
        self.logger.info(" data_path = %s", data_path)

        self.logger.info("Connecting to data...")
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info("Reading data from disk...")
        self.model.read()
        self.saliency.read()
        self.seriation.read()

        self.logger.info("Preparing parameters for seriated matrix...")
        self.prepareSeriatedParameters()

        self.logger.info("Preparing parameters for filtered matrix...")
        self.prepareFilteredParameters()

        self.logger.info("Preparing global term freqs...")
        self.prepareGlobalTermFreqs()

        self.logger.info("Writing data to disk...")
        self.client.write()

    def _extractSeriatedSubmatrix(self):
        """Return (terms, rows) of the model matrix in seriation order.

        Terms of the seriation ordering that are missing from the model
        term index are logged and skipped.
        """
        # First-occurrence row per term, matching list.index(), built in a
        # single pass so the loop below does constant-time lookups.
        first_row = {}
        for row, term in enumerate(self.model.term_index):
            first_row.setdefault(term, row)

        term_topic_matrix = self.model.term_topic_matrix
        term_subindex = []
        term_topic_submatrix = []
        for term in self.seriation.term_ordering:
            row = first_row.get(term)
            if row is None:
                # The term comes from the seriation ordering; what it is
                # missing from is the model's term index.
                self.logger.info("ERROR: Term (%s) does not appear in the term index of the model", term)
                continue
            term_subindex.append(term)
            term_topic_submatrix.append(term_topic_matrix[row])
        return term_subindex, term_topic_submatrix

    def prepareSeriatedParameters(self):
        """Store the seriated term list, topic list and submatrix on the client."""
        term_subindex, term_topic_submatrix = self._extractSeriatedSubmatrix()
        self.client.seriated_parameters = {
            "termIndex": term_subindex,
            "topicIndex": self.model.topic_index,
            "matrix": term_topic_submatrix,
        }

    def prepareFilteredParameters(self):
        """Store per-term lookup maps (rank, order, saliency, distinctiveness).

        Rank and order are the positions of each term in
        seriation.term_iter_index and seriation.term_ordering respectively.
        """
        term_rank_map = {term: position for position, term in enumerate(self.seriation.term_iter_index)}
        term_order_map = {term: position for position, term in enumerate(self.seriation.term_ordering)}
        term_saliency_map = {info["term"]: info["saliency"] for info in self.saliency.term_info}
        term_distinctiveness_map = {info["term"]: info["distinctiveness"] for info in self.saliency.term_info}
        self.client.filtered_parameters = {
            "termRankMap": term_rank_map,
            "termOrderMap": term_order_map,
            "termSaliencyMap": term_saliency_map,
            "termDistinctivenessMap": term_distinctiveness_map,
        }

    def prepareGlobalTermFreqs(self):
        """Store the seriated submatrix together with a term -> frequency map."""
        term_subindex, term_topic_submatrix = self._extractSeriatedSubmatrix()
        term_freqs = {info["term"]: info["frequency"] for info in self.saliency.term_info}
        self.client.global_term_freqs = {
            "termIndex": term_subindex,
            "topicIndex": self.model.topic_index,
            "matrix": term_topic_submatrix,
            "termFreqMap": term_freqs,
        }