# Example #1
class PrepareDataForClient(object):
    """
    Reformats data necessary for client to run.

    Extracts a subset of the complete term list and term-topic matrix and writes
    the subset to a separate file. Also, generates JSON file that merges/packages term
    information with the actual term.

    Input is term-topic probability distribution and term information, stored in 4 files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
        'term-info.txt' contains information about individual terms.

    Output is a subset of terms and matrix, as well as the term subset's information.
    Number of files created or copied: 5
    """

    def __init__(self, logging_level):
        """Create the 'PrepareDataForClient' logger, writing to stderr.

        Args:
            logging_level: Level applied to the logger and to the handler
                this constructor attaches.
        """
        self.logger = logging.getLogger('PrepareDataForClient')
        self.logger.setLevel(logging_level)
        # logging.getLogger returns the same object for the same name, so
        # only attach a handler the first time; otherwise every additional
        # instance would duplicate each log line.
        if not self.logger.handlers:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, data_path):
        """Run the preparation steps against the data stored at data_path.

        Args:
            data_path: Location handed to ModelAPI, SaliencyAPI,
                SeriationAPI and ClientAPI. Must not be None.
        """
        assert data_path is not None

        self.logger.info('Preparing data for client...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)
        self.seriation = SeriationAPI(data_path)
        self.client = ClientAPI(data_path)

        self.logger.info('Reading data from disk...')
        # NOTE(review): the calls that load model/saliency/seriation data are
        # not present in this file and their API is not visible here.
        # TODO: restore them -- the steps below read attributes of those
        # objects and presumably need them populated first.

        self.logger.info('Preparing parameters for seriated matrix...')
        self.prepareSeriatedParameters()

        self.logger.info('Preparing parameters for filtered matrix...')
        self.prepareFilteredParameters()

        self.logger.info('Preparing global term freqs...')
        self.prepareGlobalTermFreqs()

        self.logger.info('Writing data to disk...')
        # NOTE(review): the call that persists self.client is not present in
        # this file. TODO: restore it once the ClientAPI write API is confirmed.

    def _collectSeriatedRows(self):
        """Pick the matrix rows of the seriated terms, in seriation order.

        Returns:
            A pair (term_subindex, term_topic_submatrix): the seriated terms
            found in the model's term index, and the matching rows of the
            term-topic matrix. Terms missing from the term index are logged
            and skipped.
        """
        term_topic_matrix = self.model.term_topic_matrix
        # Map each term to its first row position (same result as
        # list.index) so the loop below does not rescan the term list.
        row_of_term = {}
        for row, term in enumerate(self.model.term_index):
            row_of_term.setdefault(term, row)

        term_subindex = []
        term_topic_submatrix = []
        for term in self.seriation.term_ordering:
            row = row_of_term.get(term)
            if row is None:
                self.logger.error(
                    'ERROR: Term (%s) does not appear in the list of seriated terms',
                    term)
                continue
            term_subindex.append(term)
            term_topic_submatrix.append(term_topic_matrix[row])
        return term_subindex, term_topic_submatrix

    def prepareSeriatedParameters(self):
        """Store the seriated term subset and its matrix rows on the client."""
        term_subindex, term_topic_submatrix = self._collectSeriatedRows()
        self.client.seriated_parameters = {
            'termIndex': term_subindex,
            'topicIndex': self.model.topic_index,
            'matrix': term_topic_submatrix,
        }

    def prepareFilteredParameters(self):
        """Store per-term lookup maps (rank, order, saliency, distinctiveness) on the client."""
        # Position of each term within the seriation's iteration index.
        term_rank_map = {
            term: value
            for value, term in enumerate(self.seriation.term_iter_index)
        }
        # Position of each term within the seriated ordering.
        term_order_map = {
            term: value
            for value, term in enumerate(self.seriation.term_ordering)
        }
        term_saliency_map = {
            d['term']: d['saliency']
            for d in self.saliency.term_info
        }
        term_distinctiveness_map = {
            d['term']: d['distinctiveness']
            for d in self.saliency.term_info
        }

        self.client.filtered_parameters = {
            'termRankMap': term_rank_map,
            'termOrderMap': term_order_map,
            'termSaliencyMap': term_saliency_map,
            'termDistinctivenessMap': term_distinctiveness_map,
        }

    def prepareGlobalTermFreqs(self):
        """Store the seriated submatrix plus a term -> frequency map on the client."""
        term_subindex, term_topic_submatrix = self._collectSeriatedRows()
        term_freqs = {
            d['term']: d['frequency']
            for d in self.saliency.term_info
        }

        self.client.global_term_freqs = {
            'termIndex': term_subindex,
            'topicIndex': self.model.topic_index,
            'matrix': term_topic_submatrix,
            'termFreqMap': term_freqs,
        }
# Example #2
class ComputeSaliency(object):
    """
    Distinctiveness and saliency.

    Compute term distinctiveness and term saliency, based on
    the term probability distributions associated with a set of
    latent topics.

    Input is term-topic probability distribution, stored in 3 separate files:
        'term-topic-matrix.txt' contains the entries of the matrix.
        'term-index.txt' contains the terms corresponding to the rows of the matrix.
        'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.

    Output is a list of term distinctiveness and saliency values,
    in two duplicate formats, a tab-delimited file and a JSON object.

    An auxiliary output is a list of topic weights (i.e., the number of
    tokens in the corpus assigned to each latent topic) in two
    duplicate formats, a tab-delimited file and a JSON object.
    """

    def __init__(self, logging_level):
        """Create the 'ComputeSaliency' logger, writing to stderr.

        Args:
            logging_level: Level applied to the logger and to the handler
                this constructor attaches.
        """
        self.logger = logging.getLogger('ComputeSaliency')
        self.logger.setLevel(logging_level)
        # logging.getLogger returns the same object for the same name, so
        # only attach a handler the first time; otherwise every additional
        # instance would duplicate each log line.
        if not self.logger.handlers:
            handler = logging.StreamHandler(sys.stderr)
            handler.setLevel(logging_level)
            self.logger.addHandler(handler)

    def execute(self, data_path):
        """Run the saliency computation against the data stored at data_path.

        Args:
            data_path: Location handed to ModelAPI and SaliencyAPI.
                Must not be None.
        """
        assert data_path is not None

        self.logger.info('Computing term saliency...')
        self.logger.info('    data_path = %s', data_path)

        self.logger.info('Connecting to data...')
        self.model = ModelAPI(data_path)
        self.saliency = SaliencyAPI(data_path)

        self.logger.info('Reading data from disk...')
        # NOTE(review): the call that loads the model data is not present in
        # this file and the ModelAPI load API is not visible here.
        # TODO: restore it -- the steps below read self.model attributes and
        # presumably need them populated first.

        # Order matters: term info needs the topic weights in topic-index
        # order, so it must run before rankResults re-sorts topic_info.
        self.computeTopicInfo()
        self.computeTermInfo()
        self.rankResults()

        self.logger.info('Writing data to disk...')
        # NOTE(review): the call that persists self.saliency is not present
        # in this file. TODO: restore it once the SaliencyAPI write API is
        # confirmed.

    def computeTopicInfo(self):
        """Sum every matrix column into a per-topic weight and store the list on self.saliency."""
        # zip(*matrix) transposes rows (terms) into columns (topics).
        topic_weights = [sum(x) for x in zip(*self.model.term_topic_matrix)]
        topic_info = []
        for i in range(self.model.topic_count):
            topic_info.append({
                'topic': self.model.topic_index[i],
                'weight': topic_weights[i],
            })

        self.saliency.topic_info = topic_info

    def computeTermInfo(self):
        """Iterate over the list of terms. Compute frequency, distinctiveness, saliency."""
        topic_marginal = self.getNormalized(
            [d['weight'] for d in self.saliency.topic_info])
        term_info = []
        for i in range(self.model.term_count):
            term = self.model.term_index[i]
            counts = self.model.term_topic_matrix[i]
            frequency = sum(counts)
            probs = self.getNormalized(counts)
            distinctiveness = self.getKLDivergence(probs, topic_marginal)
            saliency = frequency * distinctiveness
            term_info.append({
                'term': term,
                'saliency': saliency,
                'frequency': frequency,
                'distinctiveness': distinctiveness,
                'rank': None,  # filled in by rankResults
                'visibility': 'default',
            })
        self.saliency.term_info = term_info

    def getNormalized(self, counts):
        """Rescale a list of counts, so they represent a proper probability distribution.

        Args:
            counts: Sequence of numbers.

        Returns:
            A new list with each entry divided by the total. If the total is
            zero the entries are returned unscaled, avoiding a division by zero.
        """
        tally = sum(counts)
        if tally == 0:
            probs = list(counts)
        else:
            probs = [d / tally for d in counts]
        return probs

    def getKLDivergence(self, P, Q):
        """Compute KL-divergence from P to Q.

        Args:
            P: Sequence of non-negative probabilities.
            Q: Sequence of non-negative probabilities, same length as P.

        Returns:
            The sum of p * log(p / q) over entries with p > 0, or
            float('inf') if some entry has p > 0 while q == 0.
        """
        divergence = 0
        assert len(P) == len(Q)
        for p, q in zip(P, Q):
            assert p >= 0
            assert q >= 0
            if p > 0:
                if q == 0:
                    # P puts mass where Q has none: the divergence is
                    # unbounded, so report infinity instead of dividing by 0.
                    return float('inf')
                divergence += p * math.log(p / q)
        return divergence

    def rankResults(self):
        """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency."""
        self.saliency.topic_info = sorted(
            self.saliency.topic_info,
            key=lambda topic_weight: -topic_weight['weight'])
        self.saliency.term_info = sorted(
            self.saliency.term_info,
            key=lambda term_freq: -term_freq['saliency'])
        # Rank 0 is the most salient term.
        for i, element in enumerate(self.saliency.term_info):
            element['rank'] = i
