def train(self, train_data_rv):
        """Create a new ``Lda`` model for ``train_data_rv`` and fit it.

        The model is built from ``train_data_rv``, ``self.k`` and
        ``self.vocab_size``, stored on ``self.lda``, given this object's
        training iteration settings, and then fitted.
        """
        model = Lda(train_data_rv, self.k, self.vocab_size)
        self.lda = model

        # Push the training schedule onto the model before fitting it.
        gibbs_iters = self.train_gibbsiter
        hyper_iters = self.train_hyperparamsiter
        model.gibbs_iter = gibbs_iters
        model.hyperparams_iter = hyper_iters
        model.hyper_parameter_number_of_samples_to_average = self.nb_averages

        model.fit(hyper_iters, gibbs_iters)
Example #2
0
 def update_lda(self,
                cluster,
                data,
                data_lens,
                n_cat,
                n_em_itr,
                labels=None):
     """Update (creating it first if needed) the LDA model for this cluster.

     The model is looked up with ``self.lda_mdl(n_states, n_cat)``; when that
     returns ``None`` a new ``Lda`` is created and initialised from
     ``data_lens``. The model is then updated with the per-batch hard state
     assignments of ``cluster`` and stored in ``self._lda_dic``.

     Args:
         cluster: object exposing ``n_states`` and ``expt_s``; the state of
             each sample is taken as ``expt_s.argmax(0)``.
         data: not used by this method (kept for interface compatibility).
         data_lens: per-batch lengths; used to slice the state sequence into
             batches and to initialise a newly created model.
         n_cat: second half of the model key, also passed to ``Lda``.
         n_em_itr: iteration count forwarded to ``lda.update``.
         labels: not used by this method (kept for interface compatibility).

     Returns:
         The updated ``Lda`` instance.
     """
     n_states = cluster.n_states
     lda = self.lda_mdl(n_states, n_cat)

     # The state sequences are needed by ``lda.update`` whether the model is
     # new or cached. Building them only in the "new model" branch used to
     # raise UnboundLocalError whenever a cached model was found.
     states = cluster.expt_s.argmax(0)
     s_list = [
         states[b:e] for b, e in _data_len_ibgn_iend_itr(data_lens)
     ]

     if lda is None:
         lda = Lda(n_states, n_cat)
         n_batches = len(data_lens)
         lda.set_default_params(n_batches)
         lda.init_expt_z(data_lens)

     lda.update(s_list, n_em_itr)
     self._lda_dic[(n_states, n_cat)] = lda
     return lda
Example #3
0
if __name__ == '__main__':
    # Run switches: reuse cached data? / re-upload the exported HTML?
    use_cache = False
    update_html = True

    # Fetch the articles published after the cut-off date, then prepare them.
    newsapi = NewsboxApi()
    lookback = datetime.timedelta(days=PUBLISHED_BEFORE_N_DAYS)
    published_since_n_days = (datetime.date.today() - lookback).isoformat()
    articles = newsapi.list_articles(
        language=LANGUAGE,
        published_after=published_since_n_days,
        from_cache=use_cache,
    )
    texts = prepare_articles(articles=articles, from_cache=use_cache)

    # Train the LDA model on the prepared texts, then persist and export it.
    lda = Lda()
    lda.train_lda(texts=texts, num_topics=NUM_TOPIC)
    lda.persist_lda()
    lda.export_html()
    # lda.visualize()  # left disabled

    # Publish the exported HTML page to the 'newsmap' bucket.
    if update_html:
        bucket_name = 'newsmap'
        minioapi = MinioApi()
        minioapi.create_bucket(bucket_name=bucket_name)
        minioapi.upload_file(
            bucket_name=bucket_name,
            filename='index.html',
            file='artifacts/lda/index.html',
        )
        minioapi.make_public(bucket_name=bucket_name)
Example #4
0
    def __init__(self, trdata_path, vadata_path, tedata_path, trtopic_path, vatopic_path, tetopic_path, beta_path, word_map_path):
        """Load the review corpus and initialise the model parameters.

        Args:
            trdata_path, vadata_path, tedata_path: train / validation / test
                data paths, stored on the instance and passed to ``Corpus``.
            trtopic_path, vatopic_path, tetopic_path: text files read when
                ``init_topic == 0``. Line ``i`` holds the space-separated
                topic weights of review ``i`` of the train / validation /
                test set; each review's weights are normalised to sum to 1.
            beta_path: text file read when ``init_topic == 0``. Line ``i``
                holds the space-separated word weights of topic ``i``,
                ordered by the external word ids of ``word_map_path``.
            word_map_path: text file read when ``init_topic == 0``. Its first
                line is skipped; every other line is ``<word> <external id>``.

        The process exits with status 1 when a word of ``word_map_path`` is
        missing from the corpus vocabulary or when ``init_topic`` is invalid.
        """

        def _parse_row(line):
            # Rows are space separated. After trimming line-end characters one
            # more trailing character is dropped by [:-1] (presumably a
            # trailing separator -- TODO confirm against the file writer).
            return [float(tok) for tok in line.strip("\r\t\n")[:-1].split(" ")]

        def _load_topic_columns(path, target):
            # Store row i of the file in column i of ``target``, normalised to
            # sum to 1. ``with`` guarantees the file handle is closed.
            with open(path) as handle:
                for col, line in enumerate(handle):
                    target[:, col] = _parse_row(line)
                    target[:, col] /= np.sum(target[:, col])

        ## Parameters settings
        self.train_iters = 100
        self.infer_iters = 10
        self.grad_iters = 100
        self.K = 40
        self.max_words = 10000
        self.l_u = 0.1
        self.l_i = 0.1
        self.init_topic = 0        # 0=init from files, 1=init by training an Lda here

        ## Reading review data
        self.trdata_path = trdata_path
        self.vadata_path = vadata_path
        self.tedata_path = tedata_path
        self.corp = Corpus(trdata_path, vadata_path, tedata_path, self.max_words)
        self.n_users = self.corp.n_users
        self.n_items = self.corp.n_items
        self.n_words = self.corp.n_words
        print('N_users=%d, N_items=%d, N_words=%d, N_trreviews=%d' % (self.n_users, self.n_items, self.n_words, len(self.corp.train_votes)))

        ## Model parameter allocation
        self.theta_user = np.random.random((self.K, self.n_users))
        self.theta_item = np.random.random((self.K, self.n_items))
        self.phi1_review = np.zeros((self.K, len(self.corp.train_votes)))
        self.phi2_review = np.zeros((self.K, len(self.corp.vali_votes)))
        self.phi3_review = np.zeros((self.K, len(self.corp.test_votes)))
        self.beta_kw = np.zeros((self.n_words, self.K))
        self.log_beta_kw = np.zeros((self.n_words, self.K))
        if self.init_topic == 0:
            _load_topic_columns(trtopic_path, self.phi1_review)
            _load_topic_columns(vatopic_path, self.phi2_review)
            _load_topic_columns(tetopic_path, self.phi3_review)

            # id_map_id[external word id] -> corpus word id (-1 when unmapped).
            id_map_id = [-1] * self.n_words
            with open(word_map_path) as handle:
                for lineno, line in enumerate(handle):
                    if lineno == 0:
                        # The first line is skipped, as in the original loader.
                        continue
                    parts = line.strip("\r\t\n").split(" ")
                    word = parts[0]
                    r_widx = int(parts[1])
                    if word not in self.corp.word_ids:
                        print('Word mismatch')
                        sys.exit(1)
                    id_map_id[r_widx] = self.corp.word_ids[word]

            with open(beta_path) as handle:
                for topic, line in enumerate(handle):
                    weights = np.array(_parse_row(line))
                    # Reorder the row through id_map_id before storing it.
                    self.beta_kw[:, topic] = weights[id_map_id]
                    self.beta_kw[:, topic] /= np.sum(self.beta_kw[:, topic])
            self.log_beta_kw = np.log(self.beta_kw)
        elif self.init_topic == 1:
            lda = Lda(self.K)
            (self.phi1_review, self.beta_kw) = lda.train(self.corp.train_votes, self.n_words)
            self.log_beta_kw = np.log(self.beta_kw)
            self.phi2_review = lda.inference(self.corp.vali_votes)
            self.phi3_review = lda.inference(self.corp.test_votes)
        else:
            print('Invalid choice topic init method')
            sys.exit(1)

        # Index the training votes by user and by item.
        self.train_votes_puser = [[] for _ in range(self.n_users)]
        self.train_votes_pitem = [[] for _ in range(self.n_items)]
        for i, vote in enumerate(self.corp.train_votes):
            self.train_votes_puser[vote.user].append(i)
            self.train_votes_pitem[vote.item].append(i)
        self.probValCheck()
        print("Finished model preprocessing")
class LDA_Value_Classifier(RVClassifier):
    """Classifier that scores candidate values with a fitted ``Lda`` model.

    ``train`` fits an ``Lda`` model on the training records. The classify
    methods fit the test records to that model and score every candidate
    value ``v`` of a record ``r`` as
    ``Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r, :] . phi[:, v]``.
    """

    def __init__(self, k, vocab_size):
        """Set up the (untrained) classifier.

        k: number of hidden topics; ``None`` selects the default of 20.
        vocab_size: forwarded to ``RVClassifier.__init__``.
        """
        RVClassifier.__init__(self, vocab_size)

        # Number of hidden topics.
        self.k = 20 if k is None else k

        # Iteration settings used when fitting the training data ...
        self.train_gibbsiter = 40
        self.nb_averages = 3
        self.train_hyperparamsiter = 25
        # ... and when fitting new (test) records.
        self.test_hyperparamsiter = 20
        self.test_gibbsiter = 1
        self.test_chainnb = 5

        # Stays None until train() has been called.
        self.lda = None

    def train(self, train_data_rv):
        """Fit a new ``Lda`` model on ``train_data_rv`` and keep it on ``self.lda``."""
        self.lda = Lda(train_data_rv, self.k, self.vocab_size)
        self.lda.gibbs_iter = self.train_gibbsiter
        self.lda.hyperparams_iter = self.train_hyperparamsiter
        self.lda.hyper_parameter_number_of_samples_to_average = self.nb_averages
        self.lda.fit(self.train_hyperparamsiter, self.train_gibbsiter)

    def _compute_prediction_confidences(self, test_data_rv,
                                        ids_realizations_to_classify,
                                        v_groups):
        """Return the prediction confidences of the candidate values.

        test_data_rv: test records, fitted to the trained model.
        ids_realizations_to_classify: ids of the candidate values; they
            select the columns of ``self.lda.phi``.
        v_groups: unused here (accepted for signature symmetry).

        Returns a matrix [nb_records, nb_values] whose rows hold the
        prediction confidences of each candidate value.
        Raises ``Exception`` when the classifier has not been trained.
        """
        if self.lda is None:
            raise Exception(
                "NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications"
            )

        # Fit the new data to the lda model; the perplexity is not needed.
        test_theta, _perplexity = self.lda.fit_newrecords(
            test_data_rv, self.test_hyperparamsiter, self.test_gibbsiter,
            self.test_chainnb)
        phi = self.lda.phi[:, ids_realizations_to_classify]

        # Pr(v|r) = sum(Pr(k|r) Pr(v|k)) = theta[r, :] . phi[:, v]
        return np.dot(test_theta, phi)

    def compute_perplexity(self, test_data_rfv):
        """Not implemented for this classifier; always raises."""
        raise NotImplementedError("Not implemented exception")

    @staticmethod
    def _values_to_classes(winner_ids, v_groups):
        """Map each winning value id to the index of the first group holding it.

        NOTE(review): a value that belongs to no group contributes no entry,
        so the result can be shorter than ``winner_ids`` (and therefore
        misaligned with the scores). Behaviour kept as is -- confirm that
        callers always pass groups covering every candidate value.
        """
        winner_classes = []
        for value in winner_ids:
            for class_id, group in enumerate(v_groups):
                if value in group:
                    winner_classes.append(class_id)
                    break
        return winner_classes

    def classify_with_no_regroup(self, test_data_rfv,
                                 ids_realizations_to_classify, v_groups):
        """Classify every test record independently.

        test_data_rfv: test data in the rfv format.
        ids_realizations_to_classify: ids of the candidate values of the
            feature to classify (this feature must be absent from all the
            test records).
        v_groups: list of value-id lists; the values of the first list form
            class 0, those of the second list class 1, ...

        Returns ``[classes, scores]``: the class index decided for each
        record and the confidence of the winning value.
        """
        # Confidences of each candidate value per record: [nb_records, nb_values]
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)

        winner_ranks = np.argmax(predictions, 1)
        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])
        winner_scores = np.max(predictions, 1)

        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]

    def regroup_and_classify(self, test_data_rfv, ids_realizations_to_classify,
                             r_groups, v_groups):
        """Classify groups of records, one common decision per group.

        test_data_rfv: test data in the rfv format.
        ids_realizations_to_classify: ids of the candidate values.
        r_groups: [[id1_group1, ...], ..., [id1_groupm, ...]] record indices
            that must receive a common classification.
        v_groups: list of value-id lists, each list forming one class.

        Returns ``[classes, scores]``: the class index decided for each
        group and the score of the winning value.
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)

        winner_ranks = []
        winner_scores = []
        # For each group, multiply the confidences of its records value by
        # value and keep the value with the largest product.
        for group in r_groups:
            group_record = np.prod(predictions[group, :], 0)
            winner_ranks.append(np.argmax(group_record))
            winner_scores.append(np.amax(group_record))

        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])

        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]
	def train(self, train_data_rv):
		"""Create a new ``Lda`` model for ``train_data_rv`` and fit it.

		The model is built from ``train_data_rv``, ``self.k`` and
		``self.vocab_size``, stored on ``self.lda``, given this object's
		training iteration settings, and then fitted.
		"""
		model = Lda(train_data_rv, self.k, self.vocab_size)
		self.lda = model

		# Push the training schedule onto the model before fitting it.
		gibbs_iters = self.train_gibbsiter
		hyper_iters = self.train_hyperparamsiter
		model.gibbs_iter = gibbs_iters
		model.hyperparams_iter = hyper_iters
		model.hyper_parameter_number_of_samples_to_average = self.nb_averages

		model.fit(hyper_iters, gibbs_iters)
class LDA_Value_Classifier(RVClassifier):
	"""Classifier that scores candidate values with a fitted ``Lda`` model.

	``train`` fits an ``Lda`` model on the training records. The classify
	methods fit the test records to that model and score every candidate
	value ``v`` of a record ``r`` as
	``Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r, :] . phi[:, v]``.
	"""

	def __init__(self, k, vocab_size):
		"""Set up the (untrained) classifier.

		k: number of hidden topics; ``None`` selects the default of 20.
		vocab_size: forwarded to ``RVClassifier.__init__``.
		"""
		RVClassifier.__init__(self, vocab_size)

		# Number of hidden topics.
		self.k = 20 if k is None else k

		# Iteration settings used when fitting the training data ...
		self.train_gibbsiter = 40
		self.nb_averages = 3
		self.train_hyperparamsiter = 25
		# ... and when fitting new (test) records.
		self.test_hyperparamsiter = 20
		self.test_gibbsiter = 1
		self.test_chainnb = 5

		# Stays None until train() has been called.
		self.lda = None

	def train(self, train_data_rv):
		"""Fit a new ``Lda`` model on ``train_data_rv`` and keep it on ``self.lda``."""
		self.lda = Lda(train_data_rv, self.k, self.vocab_size)
		self.lda.gibbs_iter = self.train_gibbsiter
		self.lda.hyperparams_iter = self.train_hyperparamsiter
		self.lda.hyper_parameter_number_of_samples_to_average = self.nb_averages
		self.lda.fit(self.train_hyperparamsiter, self.train_gibbsiter)

	def _compute_prediction_confidences(self, test_data_rv, ids_realizations_to_classify, v_groups):
		"""Return the prediction confidences of the candidate values.

		test_data_rv: test records, fitted to the trained model.
		ids_realizations_to_classify: ids of the candidate values; they
			select the columns of ``self.lda.phi``.
		v_groups: unused here (accepted for signature symmetry).

		Returns a matrix [nb_records, nb_values] whose rows hold the
		prediction confidences of each candidate value.
		Raises ``Exception`` when the classifier has not been trained.
		"""
		if self.lda is None:
			raise Exception("NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications")

		# Fit the new data to the lda model; the perplexity is not needed.
		test_theta, _perplexity = self.lda.fit_newrecords(test_data_rv, self.test_hyperparamsiter, self.test_gibbsiter, self.test_chainnb)
		phi = self.lda.phi[:, ids_realizations_to_classify]

		# Pr(v|r) = sum(Pr(k|r) Pr(v|k)) = theta[r, :] . phi[:, v]
		return np.dot(test_theta, phi)

	def compute_perplexity(self, test_data_rfv):
		"""Not implemented for this classifier; always raises."""
		raise NotImplementedError("Not implemented exception")

	@staticmethod
	def _values_to_classes(winner_ids, v_groups):
		"""Map each winning value id to the index of the first group holding it.

		NOTE(review): a value that belongs to no group contributes no entry,
		so the result can be shorter than ``winner_ids`` (and therefore
		misaligned with the scores). Behaviour kept as is -- confirm that
		callers always pass groups covering every candidate value.
		"""
		winner_classes = []
		for value in winner_ids:
			for class_id, group in enumerate(v_groups):
				if value in group:
					winner_classes.append(class_id)
					break
		return winner_classes

	def classify_with_no_regroup(self, test_data_rfv, ids_realizations_to_classify, v_groups):
		"""Classify every test record independently.

		test_data_rfv: test data in the rfv format.
		ids_realizations_to_classify: ids of the candidate values of the
			feature to classify (this feature must be absent from all the
			test records).
		v_groups: list of value-id lists; the values of the first list form
			class 0, those of the second list class 1, ...

		Returns ``[classes, scores]``: the class index decided for each
		record and the confidence of the winning value.
		"""
		# Confidences of each candidate value per record: [nb_records, nb_values]
		predictions = self._compute_prediction_confidences(test_data_rfv, ids_realizations_to_classify, v_groups)

		winner_ranks = np.argmax(predictions, 1)
		winner_ids = np.asarray([ids_realizations_to_classify[rank] for rank in winner_ranks])
		winner_scores = np.max(predictions, 1)

		winner_classes = self._values_to_classes(winner_ids, v_groups)
		return [winner_classes, winner_scores]

	def regroup_and_classify(self, test_data_rfv, ids_realizations_to_classify, r_groups, v_groups):
		"""Classify groups of records, one common decision per group.

		test_data_rfv: test data in the rfv format.
		ids_realizations_to_classify: ids of the candidate values.
		r_groups: [[id1_group1, ...], ..., [id1_groupm, ...]] record indices
			that must receive a common classification.
		v_groups: list of value-id lists, each list forming one class.

		Returns ``[classes, scores]``: the class index decided for each
		group and the score of the winning value.
		"""
		predictions = self._compute_prediction_confidences(test_data_rfv, ids_realizations_to_classify, v_groups)

		winner_ranks = []
		winner_scores = []
		# For each group, multiply the confidences of its records value by
		# value and keep the value with the largest product.
		for group in r_groups:
			group_record = np.prod(predictions[group, :], 0)
			winner_ranks.append(np.argmax(group_record))
			winner_scores.append(np.amax(group_record))

		winner_ids = np.asarray([ids_realizations_to_classify[rank] for rank in winner_ranks])

		winner_classes = self._values_to_classes(winner_ids, v_groups)
		return [winner_classes, winner_scores]