def compute_ghcm_mdt_one_user(user_id):
    file_name = "ghcm_mdt_user_" + str(user_id)
    print("loading matrix of user " + str(user_id) + "...")
    [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id)
    print("values " + JsonUtils.dict_as_json_str(valuesnames))
    print("data " + JsonUtils.dict_as_json_str(rfvdata[0]))
    # vocabulary size of each feature: the number of distinct values it can take
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
    print("user " + str(user_id) + " has " + str(len(featuresnames))
          + " features and " + str(len(rfvdata)) + " records")
    print("features names")
    print(featuresnames)

    for k in [10, 20, 30]:
        # compute the ghcm_mdt with k hidden topics
        ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
        print("computing GHCM_MDT for user " + str(user_id) + "...")
        ghcm_mdt_comp.compute()

        print("constructing interpretable output for user " + str(user_id) + "...")
        # disp_m is assumed to be defined at module scope (display size parameter)
        ghcm_mdt_comp.construct_rows_interpretable_output(featuresnames, valuesnames, disp_m)
        r_output = ghcm_mdt_comp.rows_interpretable_output

        # write the result (note: every value of k writes to the same file_name)
        print("writing GHCM_MDT result for user " + str(user_id) + "...")
        JsonLogsFileWriter.write(r_output, file_name)
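
# Toy illustration (hypothetical data, not from the source): valuesnames maps
# each feature id to that feature's possible values, so the vocab_size built
# above is simply the number of distinct values per feature.
example_valuesnames = {0: ["home", "work", "other"],  # feature 0: 3 values
                       1: ["weekday", "weekend"]}     # feature 1: 2 values
example_vocab_size = [len(example_valuesnames[f]) for f in range(len(example_valuesnames))]
print(example_vocab_size)  # prints [3, 2]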
class GHCM_MDT_Value_Classifier(RFVClassifier):
    # k is the number of hidden topics used for the linearly constrained
    # Bayesian matrix factorization

    time_dimention = 1
    feature_dimention = 0

    def __init__(self, k, vocab_size):
        RFVClassifier.__init__(self, vocab_size)

        # default number of hidden topics, overridden when k is given
        self.k = 20
        if k is not None:
            self.k = k

        # training parameters
        self.train_gibbsiter = 40
        self.train_hyperparamsiter = 25
        self.nb_averages = 3

        # test-time parameters
        self.test_hyperparamsiter = 20
        self.test_gibbsiter = 1
        self.test_chainnb = 5

        self.ghcm_mdt = None

    def train(self, train_data_rfv):
        '''
        Train the classifier on the given training data.
        train_data_rfv: training data in the RFV format
        '''
        self.ghcm_mdt = GHCM_MDTComputer(train_data_rfv, self.k, self.vocab_size)
        self.ghcm_mdt.gibbs_iter = self.train_gibbsiter
        self.ghcm_mdt.hyperparams_iter = self.train_hyperparamsiter
        self.ghcm_mdt.hyper_parameter_number_of_samples_to_average = self.nb_averages
        self.ghcm_mdt.compute()
		
		
    def _compute_prediction_confidences(self, test_data_rfv, id_feature_to_classify, v_groups):
        '''
        input: test data in the RFV format and the id of the feature to classify
        output: a matrix [nb_records, nb_values] where each row contains the
        prediction confidences of the different values for one record
        '''
        if self.ghcm_mdt is None:
            raise Exception("NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before it is able to make classifications")

        # fit the new data to the ghcm_mdt model
        [test_theta, perplexity] = self.ghcm_mdt.ghcm_mdt.fit_newrecords(test_data_rfv, self.test_hyperparamsiter, self.test_gibbsiter, self.test_chainnb)
        phi = self.ghcm_mdt.phi[id_feature_to_classify]

        # Pr(v|r) = sum_k Pr(k|r) * Pr(v|k) = theta[r, :] . phi[:, v]
        predictions = np.dot(test_theta, phi)

        return predictions
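
    # Worked numeric sketch of the formula above (made-up numbers): with
    # 2 records, 3 topics and 4 values,
    #   test_theta = [[0.7, 0.2, 0.1],
    #                 [0.1, 0.3, 0.6]]             # Pr(k|r) per record
    #   phi        = [[0.50, 0.20, 0.20, 0.10],
    #                 [0.10, 0.60, 0.20, 0.10],
    #                 [0.25, 0.25, 0.25, 0.25]]    # Pr(v|k) per topic
    #   np.dot(test_theta, phi) = [[0.395, 0.285, 0.205, 0.115],
    #                              [0.230, 0.350, 0.230, 0.190]]
    # each row is a distribution over the 4 values and sums to 1.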
	
    def compute_perplexity(self, test_data_rfv):
        if self.ghcm_mdt is None:
            raise Exception("NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before it is able to make classifications")

        # evaluate the fitted ghcm_mdt model on the new records
        perplexity = self.ghcm_mdt.ghcm_mdt.perplexity_newrecords(test_data_rfv)

        return perplexity
		
	
    def classify_with_no_regroup(self, test_data_rfv, id_feature_to_classify, v_groups):
        '''
        Classify the test set elements.
        test_data_rfv: test data in the RFV format
        id_feature_to_classify: the id of the feature to classify (this feature
        must be absent from all the test records)
        v_groups: list of lists of value ids; each inner list gathers the values
        that form one classification class (the first list maps to class 0, the
        second to class 1, ...)
        return [classes, scores] where:
        classes: list containing the class id decided by the classifier for each record
        scores: list containing the confidence scores of the decided classes
        '''
        # prediction confidences for the values of the indicated feature in
        # each record -> [nb_records, nb_values]
        predictions = self._compute_prediction_confidences(test_data_rfv, id_feature_to_classify, v_groups)

        winner_ids = np.argmax(predictions, 1)
        winner_scores = np.max(predictions, 1)

        # map the winning value ids to their classes
        winner_classes = []
        for v in winner_ids:
            for cid, cl in enumerate(v_groups):
                if v in cl:
                    winner_classes.append(cid)
                    break

        return [winner_classes, winner_scores]
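
    # Example of the value-to-class mapping (made-up ids): with
    # v_groups = [[0, 1], [2, 3]], a record whose winning value id is 2 is
    # assigned class 1, while a winning value id of 0 yields class 0.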
	
	
    def regroup_and_classify(self, test_data_rfv, id_feature_to_classify, r_groups, v_groups):
        '''
        Classify the test set elements, then regroup them so that all the
        records of a group receive a common class.
        test_data_rfv: test data in the RFV format
        id_feature_to_classify: the id of the feature to classify
        r_groups: [[id1_group1, ..., idn1_group1], ..., [id1_groupm, ..., idnm_groupm]]
        groups of records that get a common classification
        v_groups: list of lists of value ids; each inner list gathers the values
        that form one classification class
        return [classes, scores] as in classify_with_no_regroup, with one entry per group
        '''
        # confidence matrix [nb_records, nb_values] for the feature to classify
        predictions = self._compute_prediction_confidences(test_data_rfv, id_feature_to_classify, v_groups)

        # The higher the confidence of a value in a record, the more 'important'
        # that value is in the record. The confidences of the records of a group
        # are combined by multiplying them element-wise (np.prod over the group's
        # rows), and the value decided for the group is the one with the largest
        # combined confidence.
        winner_ids = []
        winner_scores = []
        for group in r_groups:
            group_record = np.prod(predictions[group, :], 0)
            winner_score = np.amax(group_record)
            winner_id = np.argmax(group_record)

            winner_ids.append(winner_id)
            winner_scores.append(winner_score)

        # map the winning value ids to their classes
        winner_classes = []
        for v in winner_ids:
            for cid, cl in enumerate(v_groups):
                if v in cl:
                    winner_classes.append(cid)
                    break

        return [winner_classes, winner_scores]
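

# Hypothetical end-to-end usage sketch. train_rfv and test_rfv stand in for
# data loaded elsewhere (e.g. via RVFDataExtractor.load_rvf); the feature id
# and the groupings below are made up for illustration.
def _example_classifier_usage(train_rfv, test_rfv, vocab_size):
    clf = GHCM_MDT_Value_Classifier(20, vocab_size)
    clf.train(train_rfv)

    # values 0 and 1 of feature 3 form class 0; values 2 and 3 form class 1
    v_groups = [[0, 1], [2, 3]]
    classes, scores = clf.classify_with_no_regroup(test_rfv, 3, v_groups)

    # records 0-2 and 3-5 are known to share a label, so classify them jointly;
    # regroup_and_classify combines their per-record confidences with a product
    r_groups = [[0, 1, 2], [3, 4, 5]]
    group_classes, group_scores = clf.regroup_and_classify(test_rfv, 3, r_groups, v_groups)
    return classes, scores, group_classes, group_scores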