Example #1
	def _apply_pretransformation(self, matrix, pretransformation_name):
		transformed_trainset = np.copy(matrix)
		
		
		if pretransformation_name == "none" or pretransformation_name is None:
			#initialize the transformation scores to an all-ones array of size equal to the number of features, which is equivalent to not applying any transformation
			transformation_scores_by_feature = np.ones(np.shape(matrix)[LCBMFComputer.feature_dimention])
		
		elif pretransformation_name == "idf":
			do_laplace_smoothing = True
			[transformed_trainset, transformation_scores_by_feature] = Numpy.idf_matrix_transformation(matrix, LCBMFComputer.time_dimention, do_laplace_smoothing)
		
		elif pretransformation_name == "ldc":
			[transformed_trainset, transformation_scores_by_feature] = Numpy.ldc_matrix_transformation(matrix, LCBMFComputer.time_dimention)
		
		elif pretransformation_name == "idc":
			do_laplace_smoothing = True
			[transformed_trainset, transformation_scores_by_feature] = Numpy.idc_matrix_transformation(matrix, LCBMFComputer.time_dimention, do_laplace_smoothing)
		
		elif pretransformation_name == "idf3":
			do_laplace_smoothing = True
			[transformed_trainset, transformation_scores_by_feature] = Numpy.idf3_matrix_transformation(matrix, LCBMFComputer.time_dimention, do_laplace_smoothing)
		
		else:
			raise Exception("WRONG TRANSFORMATION EXCEPTION: the transformation "+str(pretransformation_name)+" does not exist")
			
		return [transformed_trainset, transformation_scores_by_feature]
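The Numpy.idf_matrix_transformation helper used above is project code that is not shown in these examples. Below is a minimal sketch of what such an idf weighting could look like for a (features x time) matrix, assuming time_dimention is 1 and the classic log(N/df) score with Laplace smoothing; the project's real helper may differ.

import numpy as np

def idf_matrix_transformation_sketch(matrix, time_dimention=1, do_laplace_smoothing=True):
	#number of time slots, and the number of slots each feature occurs in
	nb_slots = float(np.shape(matrix)[time_dimention])
	document_frequency = np.sum(matrix > 0, axis=time_dimention).astype(float)
	if do_laplace_smoothing:
		document_frequency += 1.0
	#classic idf: rare features get a high score
	scores = np.log(nb_slots / document_frequency)
	#weight every feature row of the matrix by its idf score
	transformed = matrix * scores[:, np.newaxis]
	return [transformed, scores]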
Example #2
def transformations_comparaison_one_user(user_id):
	global file_name
	global rows_labels
	global labels_importance
	global labels_importance_derivative
	global labels_importance_rank
	global transformation_vectors
	
	labels_importance = {}
	labels_importance_derivative = {}
	labels_importance_rank = {}
	transformation_vectors = {}
	rows_labels = None
	file_name = "transformations_comparaison_"+str(user_id)
	print "loading matrix user "+str(user_id)+"..."
	data_matrix = MDataExtractor.load_matrix(user_id)
	
	rows_labels =  MDataExtractor.load_labels_vector(user_id)
	columns_labels = MDataExtractor.load_time_vector(user_id)
	importance_scores = MDataExtractor.load_importance_scores(user_id)
	
	add_transformation(data_matrix, "presence_count")
	add_transformation((data_matrix*100/(np.size(data_matrix,time_dimention)*1.0)), "presence_percentage")
	
	[idf_matrix, idf_scores] = Numpy.idf_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
	add_transformation(idf_matrix, "idf_score")
	
	
	'''[idf2_matrix, idf2_scores] = Numpy.idf2_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
	add_transformation(idf2_matrix, "idf2_score")'''
	
	'''[idf10_matrix, idf10_scores] = Numpy.idflog10_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
	add_transformation(idf10_matrix, "idflog10_score")'''
	
	[idf3_matrix, idf3_scores] = Numpy.idf3_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
	add_transformation(idf3_matrix, "idf3_score")
	
	[ldc_matrix, ldc_scores] = Numpy.ldc_matrix_transformation(data_matrix, time_dimention)
	add_transformation(ldc_matrix, "ldc_score")
		
	[idc_matrix, idc_scores] = Numpy.idc_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
	add_transformation(idc_matrix, "idc_score")
	
	compare("presence_count", user_id)
Example #3
def add_transformation(matrix, transformation_name):
	global transformation_vectors
	sum_vector = matrix.sum(time_dimention)
	rank_vector = Numpy.ranks(sum_vector, reverse = True)
	
	transformation_vectors[transformation_name]={}
	transformation_vectors[transformation_name][0] = sum_vector
	transformation_vectors[transformation_name][1] = rank_vector
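Numpy.ranks is also project code that is not shown. A plain-numpy sketch of the assumed behaviour, where reverse=True gives rank 1 to the feature with the largest sum:

import numpy as np

sums = np.array([3.0, 10.0, 7.0])
order = np.argsort(-sums)                 #indices sorted by decreasing sum
ranks = np.empty(len(sums), dtype=int)
ranks[order] = np.arange(1, len(sums) + 1)
print(ranks)                              #-> [3 1 2]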
Example #5
def compute_svd_one_user(user_id):
    file_name = "svd_user_" + str(user_id)
    print "loading matrix user " + str(user_id) + "..."
    data_matrix = MDataExtractor.load_matrix(user_id)
    rows_labels = MDataExtractor.load_labels_vector(user_id)
    columns_labels = MDataExtractor.load_time_vector(user_id)
    importance_scores = MDataExtractor.load_importance_scores(user_id)

    print "user " + str(user_id) + " has " + str(
        len(rows_labels)) + " features (rows) and " + str(
            len(columns_labels)) + " realization (columns)"

    #apply the idf (or one of the alternative idc/idf3/ldc) transformations before computing the SVD
    print "doing idf transformation for user " + str(user_id) + "..."
    document_transformed_matrix = np.copy(data_matrix)
    [document_transformed_matrix,
     scores] = Numpy.idf_matrix_transformation(data_matrix, time_dimention,
                                               do_laplace_smoothing)
    #[document_transformed_matrix, scores] = Numpy.idc_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
    #[document_transformed_matrix, scores] = Numpy.idf3_matrix_transformation(data_matrix, time_dimention, do_laplace_smoothing)
    #[document_transformed_matrix, scores] = Numpy.ldc_matrix_transformation(data_matrix, time_dimention)

    term_transformed_matrix = np.ones(np.shape(data_matrix))
    #term_transformed_matrix = Numpy.ti_matrix_transformation(data_matrix, importance_scores)
    #term_transformed_matrix = Numpy.nti_matrix_transformation(data_matrix,  importance_scores)

    data_matrix = document_transformed_matrix * term_transformed_matrix

    #compute the SVD
    svd_comp = SVDComputer(data_matrix, rows_labels, columns_labels)
    print "computing SVD for user " + str(user_id) + "..."
    svd_comp.compute_svd()

    print "constructing interpretable output for user " + str(user_id) + "..."
    energy_captured = svd_comp.construct_rows_interpretable_output(
        disp_k, disp_m)
    r_output = svd_comp.rows_interpretable_output

    print "the energy captured with " + str(disp_k) + " concepts is " + str(
        energy_captured) + " %"

    #write the result
    print "writing SVD result for user " + str(user_id) + "..."
    JsonLogsFileWriter.write(r_output, file_name)
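SVDComputer is project code, but the "energy captured" it reports is presumably the usual ratio of squared singular values. A self-contained sketch under that assumption, with stand-in data:

import numpy as np

matrix = np.random.rand(40, 200)          #features x realizations, stand-in data
singular_values = np.linalg.svd(matrix, compute_uv=False)
k = 10                                    #number of concepts kept (disp_k)
energy = 100.0 * np.sum(singular_values[:k] ** 2) / np.sum(singular_values ** 2)
print("the energy captured with " + str(k) + " concepts is " + str(energy) + " %")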
Example #6
from features_cooccurance_one_user import UserFeaturesCooccurences as UFC
import sys
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from numpy_utils import Numpy as n


ufc = UFC('/speech/dbwork/mul/reco1/AppPrediction/SonyLogging/Logs/from_TKY/pulled_from_TKY/mixs_launcher_logs/json/352136065015162/all/all_in_one_log.json')
print("features")
print(ufc.features)
print("\n\nco occurences rates")
print(n.str(ufc.cooccurences_rates))
print("\n\nco occurences numbers")
print(n.str(ufc.cooccurences_number))
print("\n\noccurences rates")
print(n.str(ufc.occurences_number))
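UserFeaturesCooccurences is project code as well. Assuming cooccurences_number counts, for each pair of features, the records in which both appear, a toy reconstruction from a binary (records x features) presence matrix could look like this; the project's exact normalization may differ:

import numpy as np

presence = np.array([[1, 1, 0],           #records x features, 1 = feature present
                     [1, 0, 1],
                     [1, 1, 1]])
cooccurrences_number = np.dot(presence.T, presence)  #cell (i,j): records containing both i and j
occurrences_number = np.diag(cooccurrences_number)   #cell (i,i): records containing i
cooccurrences_rates = 100.0 * cooccurrences_number / occurrences_number[:, np.newaxis]
print(cooccurrences_rates)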
import collections
from datetime import datetime
import numpy as np
import numpy.ma as ma
#project-specific names (DataExtractor, MDataExtractor, the Matrix*FeatureTransformer
#classes, JsonUtils, Numpy, date_min, coocurring_precision, features_importance_score_one)
#are assumed to be imported from the surrounding codebase

def transform_to_matrix_one_user(user_id):
	print "loading data for user "+str(user_id)
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)
	
	#order the data alphabetically by feature name
	print "ordering data "+str(user_id)
	data = collections.OrderedDict(sorted(data.items()))
	
	#get the first date and the last date
	print "getting first date and last date "
	end_date = date_min
	start_date = datetime.now()
	for feature, feature_data in data.iteritems():
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
		if begin_date < start_date:
			start_date = begin_date
			
		last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys())-1])
		if last_date > end_date:
			end_date = last_date
		
		data[feature] = feature_data
	
	#construct the data matrix
	#I- construct the matrices of all the features
	print "constructing the matrixes "
	rows = 0
	
	transformers = {} 
	for feature, feature_data in data.iteritems():
		if feature == "location":
			transformers[feature] = MatrixLocationFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
			transformers[feature] = MatrixBleutoothFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		else:
			transformers[feature] = MatrixFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
			
		if feature in features_importance_score_one:
			transformers[feature].let_importance_scores_to_1 = True
		
		transformers[feature].transform()
		rows += transformers[feature].nbdimentions
	
	#construct the time feature
	transformers[MatrixTimeFeatureTransformer.feature_name] = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
	transformers[MatrixTimeFeatureTransformer.feature_name].transform()
	rows +=  transformers[MatrixTimeFeatureTransformer.feature_name].nbdimentions
	columns = transformers[MatrixTimeFeatureTransformer.feature_name].nbtimeslots
	
	#II-concatenate all the matrices of each feature into one big matrix (do the same for the labels vector)
	print "regrouping the matrixes "
	data_matrix = np.zeros((columns, rows))
	labels_vector = [""]* rows
	dimentions_importance_score = np.zeros(rows)
	transformers = collections.OrderedDict(sorted(transformers.items()))
	
	begin_row_index = 0
	end_row_index = 0
	for feature, feature_transformer in transformers.iteritems():
		end_row_index = begin_row_index + feature_transformer.nbdimentions
		data_matrix[:, begin_row_index:end_row_index] = feature_transformer.matrix_data
		labels_vector[begin_row_index:end_row_index] = feature_transformer.labels_vector
		dimentions_importance_score[begin_row_index:end_row_index] = feature_transformer.realization_importance_score
		begin_row_index = end_row_index
	
	'''
	The matrix contains many feature vectors that are 0 in all the features except the time features.
	Those vectors correspond to the time slots in which no record was made.
	We want to eliminate those slots and their corresponding timestamps.
	'''
	time_vector = transformers.values()[0].time_vector
	[data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
	data_matrix = np.transpose(data_matrix)
	
	print "the labels are : "
	print JsonUtils.dict_as_json_str(labels_vector)
	
	
	print "first date of observation "+str(start_date)
	print "first date of observation "+str(end_date)
	print "dimension of the labels (features) vector : "+str(len(labels_vector))
	print "dimension of the time vector : "+str(len(time_vector))
	print "dimension of the resulted matrix (features, time) "+str(data_matrix.shape)
	print "the number of non zeros values is : "+str(np.count_nonzero(data_matrix))+"/"+str(np.size(data_matrix))
	print "the number of negative values in the matrix is : "+str(np.size(ma.masked_array(data_matrix, mask=(data_matrix>=0)).compressed()))
	print "the data matrix printed : "
	print Numpy.str(data_matrix)
	
	#write the matrix data
	MDataExtractor.save_matrix(user_id, data_matrix)
	
	#write the labels vector, then the time vector and the importance scores
	MDataExtractor.save_labels_vector(user_id, labels_vector)
	MDataExtractor.save_time_vector(user_id, time_vector)
	MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
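eliminate_empty_records is referenced above but not shown. A simplified sketch, assuming a time slot is empty when all its feature values are 0 (per the comment above, the real helper also has to ignore the always-active time features):

import numpy as np

def eliminate_empty_records_sketch(data_matrix, time_vector):
	#data_matrix is (time x features) at this point; keep the slots with at least one record
	keep = np.any(data_matrix != 0, axis=1)
	kept_times = [t for t, k in zip(time_vector, keep) if k]
	return [data_matrix[keep, :], kept_times]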
Example #8
import os
import sys
import pprint as pp
import numpy as np
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from numpy_utils import Numpy as n
from features_cooccurance_one_user import UserFeaturesCooccurences
#json_data_dir (the directory holding each user's json data) is assumed to be defined elsewhere

#array containing the path to the validated json data for each user
users_json_files_array = [json_data_dir+x+"/all/all_in_one_validated_log.json" for x in os.listdir(json_data_dir)]

pp.pprint(users_json_files_array)
total_features_cooccurences_number = np.zeros((len(UserFeaturesCooccurences.features),len(UserFeaturesCooccurences.features)))
total_features_occurences_number = np.zeros((len(UserFeaturesCooccurences.features),len(UserFeaturesCooccurences.features)))
result = "\n\n\nThe feature ids follow the order below : \n"+pp.pformat(UserFeaturesCooccurences.features)
user_number = 1
for json_file in users_json_files_array:
	if os.path.isfile(json_file):
		ufc = UserFeaturesCooccurences(json_file)
		total_features_cooccurences_number = total_features_cooccurences_number + ufc.cooccurences_number
		total_features_occurences_number = total_features_occurences_number + ufc.occurences_number
		result = (result+"\n \n \n user "+str(user_number)+"\n"+ "co-occurrences number matrix:\n"+
		n.str(ufc.cooccurences_number)+"\n\nco-occurrences rate matrix:\n"+n.str(ufc.cooccurences_rates))
	print("user "+str(user_number)+" extracted")
	user_number+=1

#compute the overall rate and add it to the result
total_features_cooccurences_rate = np.nan_to_num((total_features_cooccurences_number*100.0)/total_features_occurences_number)
result = (result + "\n \n \n overall users\n"+ "co-occurrences number matrix:\n"+
		n.str(total_features_cooccurences_number)+"\n\nco-occurrences rate matrix:\n"+
		n.str(total_features_cooccurences_rate))

#write an explanation about the results
comment = ("This file represents the co-occurrences of the different features. \n"+
"For each user, 2 matrices are shown:\n"+
	"- cooccurences_number: each cell (i,j) represents the number of co-occurrences \n"+
	"that features i(row) and j(column) have. This matrix is thus symmetric.\n"+
	"- cooccurences_rates: each cell (i,j) represents the percentage over the number of appearances\n"+