def train(self, train_data_rfv):
	"""Build a new ``HcmMdt`` model from the training data and fit it.

	The model is constructed from ``train_data_rfv``, ``self.k`` and
	``self.vocab_size``, stored on ``self.hcm_mdt``, and then fitted with
	``self.train_iter`` as the argument to ``fit``.
	"""
	model = HcmMdt(train_data_rfv, self.k, self.vocab_size)
	# Publish the model on the instance before fitting, as the original did.
	self.hcm_mdt = model
	model.fit(self.train_iter)
class HCM_MDT_Value_Classifier (RFVClassifier):
	"""Classifier that predicts the value of one feature using an ``HcmMdt`` model.

	The model is created and fitted by ``train``; the classification methods
	then fit the test records to that model and combine the resulting
	per-record mixture (``test_pi``) with the model's per-feature matrix
	``P[id_feature_to_classify]`` to score every candidate value.
	"""

	# Class-level constants; they are not referenced inside this class.
	time_dimention = 1
	feature_dimention = 0

	def __init__(self, k, vocab_size):
		"""Create an untrained classifier.

		k: the number of hidden topics to take into account; ``None`` selects
		   the default of 20.
		vocab_size: forwarded to ``RFVClassifier.__init__`` and later to ``HcmMdt``.
		"""
		RFVClassifier.__init__(self, vocab_size)

		# Identity test: only a literal None triggers the default.
		self.k = 20 if k is None else k

		self.train_iter = 50
		self.test_iter = 20
		self.hcm_mdt = None

	def train(self, train_data_rfv):
		"""Train the classifier on the given training data (rfv format).

		Replaces any previously trained model stored on ``self.hcm_mdt``.
		"""
		self.hcm_mdt = HcmMdt(train_data_rfv, self.k, self.vocab_size)
		self.hcm_mdt.fit(self.train_iter)

	def _require_trained(self):
		"""Raise if ``train`` has not been called yet."""
		if self.hcm_mdt is None:
			raise Exception("NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications")

	@staticmethod
	def _values_to_classes(value_ids, v_groups):
		"""Map each winning value id to the index of the group in ``v_groups`` containing it.

		A value that belongs to no group is mapped to ``None`` so that the
		returned list always has one entry per input value and stays aligned
		with the accompanying scores.
		"""
		return [
			next((cid for cid, members in enumerate(v_groups) if v in members), None)
			for v in value_ids
		]

	def _compute_prediction_confidences(self, test_data_rfv, id_feature_to_classify, v_groups):
		"""Score every candidate value of a feature for every test record.

		input: test data, id of the feature to classify (``v_groups`` is
		       accepted for signature compatibility but not used here)
		output: a matrix [nb_records, nb_values] where each row represents the
		        different prediction confidences of each value
		raises: Exception when the classifier has not been trained
		"""
		self._require_trained()

		# Fit the new data to the hcm_mdt model; the perplexity it also
		# returns is not needed for prediction.
		test_pi, _ = self.hcm_mdt.fit_newrecords(test_data_rfv, self.test_iter)
		phi = self.hcm_mdt.P[id_feature_to_classify]

		# Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r,:] . phi[:,v]
		return np.dot(test_pi, phi)

	def compute_perplexity(self, test_data_rfv):
		"""Return the perplexity the trained model reports for the given test records.

		raises: Exception when the classifier has not been trained
		"""
		self._require_trained()
		return self.hcm_mdt.perplexity_newrecords(test_data_rfv)

	def classify_with_no_regroup(self, test_data_rfv, id_feature_to_classify,v_groups):
		"""Classify each test record independently.

		test_data_rfv: test data in the rfv format
		id_feature_to_classify: the id of the feature to be classified (note
			that this feature must be absent from all the test records)
		v_groups: list of lists of value ids; each inner list is treated as one
			classification class (first list -> class 0, second -> class 1, ...)
		return [classes, scores] where:
		classes: list with, for each record, the index of the group holding the
			best-scoring value (``None`` if that value is in no group)
		scores: numpy array with the score of the best value of each record
		"""
		# Prediction confidences for the values of the feature in each record -> [nb_records, nb_values]
		predictions = self._compute_prediction_confidences(test_data_rfv, id_feature_to_classify, v_groups)

		winner_ids = np.argmax(predictions, 1)
		winner_scores = np.max(predictions, 1)

		winner_classes = self._values_to_classes(winner_ids, v_groups)
		return [winner_classes, winner_scores]

	def regroup_and_classify(self, test_data_rfv, id_feature_to_classify, r_groups, v_groups):
		"""Classify groups of records, deciding one common class per group.

		test_data_rfv: test data in the rfv format
		id_feature_to_classify: the id of the hidden feature to classify
		r_groups: [[id1_group1,...,id_n1_group1],...,[id_1_groupm, ..., id_nm_groupm]]
			groups of record indices that get a common classification
		v_groups: list of lists of value ids; each inner list is treated as one
			classification class
		return [classes, scores] where:
		classes: list with, for each record group, the index of the value group
			holding the winning value (``None`` if that value is in no group)
		scores: list with the combined score of the winning value of each group
		"""
		predictions = self._compute_prediction_confidences(test_data_rfv, id_feature_to_classify, v_groups)

		# Each row holds the confidences of every value for one record. The
		# records of a group are combined by multiplying their confidences
		# value by value (np.prod over the record axis); the value with the
		# largest product wins for the whole group.
		winner_ids = []
		winner_scores = []
		for group in r_groups:
			group_record = np.prod(predictions[group, :], 0)
			winner_ids.append(np.argmax(group_record))
			winner_scores.append(np.amax(group_record))

		winner_classes = self._values_to_classes(winner_ids, v_groups)
		return [winner_classes, winner_scores]
	def train(self, train_data_rfv):
		"""Build a new ``HcmMdt`` model, configure its iteration settings and run it.

		The model's ``gibbs_iter`` and ``hyperparams_iter`` attributes are set
		from ``self.train_gibbsiter`` and ``self.train_hyperparamsiter`` before
		``compute()`` is called.
		"""
		model = HcmMdt(train_data_rfv, self.k, self.vocab_size)
		# Store the model on the instance first, then configure it.
		self.hcm_mdt = model
		model.gibbs_iter = self.train_gibbsiter
		model.hyperparams_iter = self.train_hyperparamsiter
		model.compute()
Exemple #4
0
    def train(self, train_data_rfv):
        """Create an ``HcmMdt`` model for the training data and fit it.

        The model receives ``train_data_rfv``, ``self.k`` and
        ``self.vocab_size``; ``fit`` is called with ``self.train_iter``.
        """
        fitted = HcmMdt(train_data_rfv, self.k, self.vocab_size)
        # Assign before fitting so the attribute is set even if fit raises.
        self.hcm_mdt = fitted
        fitted.fit(self.train_iter)
Exemple #5
0
 def train(self, train_data_rfv):
     """Instantiate ``HcmMdt`` on the training data, set its iteration
     attributes from this classifier, and call ``compute()``.
     """
     hcm = HcmMdt(train_data_rfv, self.k, self.vocab_size)
     # The instance attribute is set before the model is configured.
     self.hcm_mdt = hcm
     hcm.gibbs_iter = self.train_gibbsiter
     hcm.hyperparams_iter = self.train_hyperparamsiter
     hcm.compute()
Exemple #6
0
class HCM_MDT_Value_Classifier(RFVClassifier):
    """Classifier that predicts the value of one feature using an ``HcmMdt`` model.

    The model is created and fitted by ``train``; the classification methods
    then fit the test records to that model and combine the resulting
    per-record mixture (``test_pi``) with the model's per-feature matrix
    ``P[id_feature_to_classify]`` to score every candidate value.
    """

    # Class-level constants; they are not referenced inside this class.
    time_dimention = 1
    feature_dimention = 0

    def __init__(self, k, vocab_size):
        """Create an untrained classifier.

        k: the number of hidden topics to take into account; ``None`` selects
           the default of 20.
        vocab_size: forwarded to ``RFVClassifier.__init__`` and later to ``HcmMdt``.
        """
        RFVClassifier.__init__(self, vocab_size)

        # Identity test: only a literal None triggers the default.
        self.k = 20 if k is None else k

        self.train_iter = 50
        self.test_iter = 20
        self.hcm_mdt = None

    def train(self, train_data_rfv):
        """Train the classifier on the given training data (rfv format).

        Replaces any previously trained model stored on ``self.hcm_mdt``.
        """
        self.hcm_mdt = HcmMdt(train_data_rfv, self.k, self.vocab_size)
        self.hcm_mdt.fit(self.train_iter)

    def _require_trained(self):
        """Raise if ``train`` has not been called yet."""
        if self.hcm_mdt is None:
            raise Exception(
                "NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications"
            )

    @staticmethod
    def _values_to_classes(value_ids, v_groups):
        """Map each winning value id to the index of the group in ``v_groups`` containing it.

        A value that belongs to no group is mapped to ``None`` so that the
        returned list always has one entry per input value and stays aligned
        with the accompanying scores.
        """
        return [
            next((cid for cid, members in enumerate(v_groups) if v in members),
                 None)
            for v in value_ids
        ]

    def _compute_prediction_confidences(self, test_data_rfv,
                                        id_feature_to_classify, v_groups):
        """Score every candidate value of a feature for every test record.

        input: test data, id of the feature to classify (``v_groups`` is
               accepted for signature compatibility but not used here)
        output: a matrix [nb_records, nb_values] where each row represents the
                different prediction confidences of each value
        raises: Exception when the classifier has not been trained
        """
        self._require_trained()

        # Fit the new data to the hcm_mdt model; the perplexity it also
        # returns is not needed for prediction.
        test_pi, _ = self.hcm_mdt.fit_newrecords(test_data_rfv,
                                                 self.test_iter)
        phi = self.hcm_mdt.P[id_feature_to_classify]

        # Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r,:] . phi[:,v]
        return np.dot(test_pi, phi)

    def compute_perplexity(self, test_data_rfv):
        """Return the perplexity the trained model reports for the given test records.

        raises: Exception when the classifier has not been trained
        """
        self._require_trained()
        return self.hcm_mdt.perplexity_newrecords(test_data_rfv)

    def classify_with_no_regroup(self, test_data_rfv, id_feature_to_classify,
                                 v_groups):
        """Classify each test record independently.

        test_data_rfv: test data in the rfv format
        id_feature_to_classify: the id of the feature to be classified (note
            that this feature must be absent from all the test records)
        v_groups: list of lists of value ids; each inner list is treated as one
            classification class (first list -> class 0, second -> class 1, ...)
        return [classes, scores] where:
        classes: list with, for each record, the index of the group holding the
            best-scoring value (``None`` if that value is in no group)
        scores: numpy array with the score of the best value of each record
        """
        # Prediction confidences for the values of the feature in each record -> [nb_records, nb_values]
        predictions = self._compute_prediction_confidences(
            test_data_rfv, id_feature_to_classify, v_groups)

        winner_ids = np.argmax(predictions, 1)
        winner_scores = np.max(predictions, 1)

        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]

    def regroup_and_classify(self, test_data_rfv, id_feature_to_classify,
                             r_groups, v_groups):
        """Classify groups of records, deciding one common class per group.

        test_data_rfv: test data in the rfv format
        id_feature_to_classify: the id of the hidden feature to classify
        r_groups: [[id1_group1,...,id_n1_group1],...,[id_1_groupm, ..., id_nm_groupm]]
            groups of record indices that get a common classification
        v_groups: list of lists of value ids; each inner list is treated as one
            classification class
        return [classes, scores] where:
        classes: list with, for each record group, the index of the value group
            holding the winning value (``None`` if that value is in no group)
        scores: list with the combined score of the winning value of each group
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, id_feature_to_classify, v_groups)

        # Each row holds the confidences of every value for one record. The
        # records of a group are combined by multiplying their confidences
        # value by value (np.prod over the record axis); the value with the
        # largest product wins for the whole group.
        winner_ids = []
        winner_scores = []
        for group in r_groups:
            group_record = np.prod(predictions[group, :], 0)
            winner_ids.append(np.argmax(group_record))
            winner_scores.append(np.amax(group_record))

        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]