Example #1
def compute_ghcm_mdt_one_user(user_id):
    file_name = "ghcm_mdt_user_" + str(user_id)
    print "loading matrix user " + str(user_id) + "..."
    [rfvdata, featuresnames, valuesnames,
     recordsdates] = RVFDataExtractor.load_rvf(user_id)
    print "values" + JsonUtils.dict_as_json_str(valuesnames)
    print "data" + JsonUtils.dict_as_json_str(rfvdata[0])
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))]
    print "user " + str(user_id) + " has " + str(
        len(featuresnames)) + " features and " + str(len(rfvdata)) + " records"
    print "features names"
    print featuresnames

    print "values" + JsonUtils.dict_as_json_str(valuesnames)
    for k in [10, 20, 30]:
        #compute the ghcm_mdt
        ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
        print "computing SVD for user " + str(user_id) + "..."
        ghcm_mdt_comp.compute()

        print "constructing interpretable output for user " + str(
            user_id) + "..."
        ghcm_mdt_comp.construct_rows_interpretable_output(
            featuresnames, valuesnames, disp_m)
        r_output = ghcm_mdt_comp.rows_interpretable_output

        #write the result
        print "writing SVD result for user " + str(user_id) + "..."
        JsonLogsFileWriter.write(r_output, file_name)
	def test_time_variances_for_array_feature(data, array_feature):
		time_variances_number = {}
		time_variances_feature_min = {}
		time_variances_feature_max = {}

		for record_id in data:
			if array_feature in data[record_id]:
				max_time = 0
				min_time = sys.maxint
				feature = data[record_id][array_feature]
				for entry in feature:
					current_time = long(entry['createDate'])
					if current_time >= max_time:
						max_time = current_time
					if current_time <= min_time:
						min_time = current_time

				time_variance = max_time - min_time
				if time_variance not in time_variances_number:
					time_variances_number[time_variance] = 0
				
				time_variances_number[time_variance] +=1
					
		time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(),reverse=True))
		
		print "time variances distribution for "+array_feature
		print JsonUtils.dict_as_json_str(time_variances_number)
		print "\n \n"
	def test_time_variances_in_one_record(data):
		time_variances_number = {}
		time_variances_feature_min = {}
		time_variances_feature_max = {}

		for record_id in data:
			max_time = 0
			min_time = sys.maxint
			max_feature=""
			min_feature = ""
			record = data[record_id]
			
			for feature, value in record.iteritems():
				
				try:
					current_time = long(value['createDate'])
						
					if current_time > max_time :
						max_time = current_time
						max_feature = feature
					
					if current_time < min_time:
						min_time = current_time
						min_feature = feature
						
				except TypeError:
					#it is an array feature
					
					for entry in value:
						current_time = long(entry['createDate'])
						if current_time >= max_time :
							max_time = current_time
							max_feature = feature
							
						if current_time <= min_time:
							min_time = current_time
							min_feature = feature
							
							
			time_variance = max_time - min_time
			if time_variance not in time_variances_number:
				time_variances_number[time_variance] = 0
				
			time_variances_number[time_variance] +=1
			
			if max_feature not in time_variances_feature_max:
				time_variances_feature_max[max_feature] = 0
			time_variances_feature_max[max_feature] += 1
			
			if min_feature not in time_variances_feature_min:
				time_variances_feature_min[min_feature] = 0
			time_variances_feature_min[min_feature] += 1
		
		time_variances_number = collections.OrderedDict(sorted(time_variances_number.items(),reverse=True))
		
		print "time variances distribution "
		print JsonUtils.dict_as_json_str(time_variances_number)
		print "\n \n"
	def transform(self):
		
		sony_activity_counts = self.count_sony_activity_realizations()
		android_activity_counts = self.count_android_activity_realizations()
		
		print "Sony activities duration (in minutes) :"
		print JsonUtils.dict_as_json_str(sony_activity_counts)
		
		print "Android activities duration (in minutes) : "
		print JsonUtils.dict_as_json_str(android_activity_counts)
		
		
		#self.exclusive_sony_activity_transform_one()
		self.exclusive_android_activity_transform_one()
	def extract_realizations_in_time_range_soft_version_optimized_for_sorted_data_copy_verbose(feature_realizations_sorted_copy, realization_key):
		target_realizations = []
		target_time_range = realization_key
			
		for current_time_range in feature_realizations_sorted_copy.keys():
			realization = feature_realizations_sorted_copy[current_time_range]
			
			if DataOperations.is_ended_before_the_start_of(current_time_range, target_time_range):
				#the current realization ended before the target time range; since the target times given
				#to this method are assumed to be increasing, this entry can never match again, so we just remove it
				#(the DataOperations interval helpers used here are sketched after this function)
				del feature_realizations_sorted_copy[current_time_range]
				
			elif DataOperations.does_date_overlaps(target_time_range , current_time_range):
				#the current realization overlaps the target time range, so select it and extend the target
				#time range to include it (to satisfy the soft version property)
				print "gps "+JsonUtils.dict_as_json_str(realization)+" that occurred at time "+current_time_range+" included in the selection"
				target_realizations.append(realization)
				#as the target times are strictly increasing, we delete this entry because it will never match another target time
				del feature_realizations_sorted_copy[current_time_range]
				target_time_range = DataOperations.union_of_date_intervals(target_time_range , current_time_range)
				
			elif DataOperations.is_ended_before_the_start_of(target_time_range , current_time_range):
				#the current time range starts after the end of the target one, so no further realizations
				#can fall inside the target time range
				#print current_time_range+ ": CAUSED STOP LOOP \n"
				break
				
			
		#print "the selected realizations are \n"+JsonUtils.dict_as_json_str(target_realizations)+"\n\n\n\n"
		#print "\n\n\n\n"
		return target_realizations
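	# The selection loop above depends on three DataOperations interval helpers that are not
	# shown in this example. The sketch below only illustrates their intended semantics, under
	# the assumption that a time range is a (start, end) pair of numeric timestamps; the real
	# code stores time ranges as string keys whose exact format is not shown here.
	def is_ended_before_the_start_of(range_a, range_b):
		#range_a finishes strictly before range_b begins
		return range_a[1] < range_b[0]

	def does_date_overlaps(range_a, range_b):
		#two closed intervals overlap when neither one ends before the other starts
		return not (range_a[1] < range_b[0] or range_b[1] < range_a[0])

	def union_of_date_intervals(range_a, range_b):
		#smallest interval covering both ranges, used to extend the target range
		return (min(range_a[0], range_b[0]), max(range_a[1], range_b[1]))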
	def print_times_for_specific_locations(data):
		accuracies = {}
		big = 0 
		small = 0
		for record_id in data:
			if "location" in data[record_id]:
				accuracy = data[record_id]["location"]["accuracy"]
				
				if accuracy not in accuracies:
					accuracies[accuracy] = 0
				
				accuracies[accuracy]+=1
				
				if accuracy>200:
					big+=1
				else:
					small+=1
					
		
		accuracies = collections.OrderedDict(sorted(accuracies.items(),reverse=True))
		print "accuracies for location are : "
		print JsonUtils.dict_as_json_str(accuracies)
		print "there is "+str(big)+" accuracies bigger than 200 meters from a total of "+str(big+small)
		print "\n \n"
Example #8
def compare(reference_transformation, user_id):
	global labels_importance
	global labels_importance_rank
	#global labels_importance_derivative
	index = 0
	transformations = transformation_vectors.keys()
	for label in rows_labels:
		labels_importance[label] = {}
		labels_importance_rank[label] = {}
		for transformation in transformations:
			labels_importance[label][transformation]=transformation_vectors[transformation][0][index]
			labels_importance_rank[label][transformation]= transformation_vectors[transformation][1][index]
			#labels_importance_derivative[label][transformation]= transformation_vectors[transformation][2][index]
		
		index +=1
		
		
	#sort the dictionaries by presence rate, with the most frequent feature at the beginning
	labels_importance = collections.OrderedDict(sorted(labels_importance.items(), key=lambda x: x[1][reference_transformation], reverse = True))
	#labels_importance_derivative = collections.OrderedDict(sorted(labels_importance_derivative.items(), key=lambda x: x[1][reference_transformation], reverse = True))
	labels_importance_rank = collections.OrderedDict(sorted(labels_importance_rank.items(), key=lambda x: x[1][reference_transformation]))
	
	
	print JsonUtils.dict_as_json_str(labels_importance)
	
	print JsonUtils.dict_as_json_str(labels_importance_rank)
	#print np.shape(data_matrix)
	
	#write the dictionaries into files
	out = LogsFileWriter.open(file_name)
	LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance),out)
	LogsFileWriter.write(JsonUtils.dict_as_json_str(labels_importance_rank),out)
	LogsFileWriter.close(out)
	
	
	#plot the records importance vs different transformation scores
	importances_list = []
	importances_legends = []
	ranks_list = []
	ranks_legends = []
	importances_derivatives_list = []
	importances_derivatives_legends = []
	for transformation in transformations:
		importance_list = [importance[transformation] for importance in labels_importance.values()]
		importances_list.append(importance_list)
		importances_legends.append(transformation)
		
		rank_list = [rank[transformation] for rank in labels_importance_rank.values()]
		ranks_list.append(rank_list)
		ranks_legends.append(transformation)
		
		importance_derivative_list = np.diff(np.asarray(importance_list), 1).tolist()
		importances_derivatives_list.append(importance_derivative_list)
		importances_derivatives_legends.append(transformation)
		
		
	importances_derivatives_list.append([0]*len(importances_derivatives_list[0]))
	importances_derivatives_legends.append("y=0")
	PlotlibDrawer.plot_1(labels_importance.keys(), [percentage["presence_percentage"] for percentage in labels_importance.values()], "features rank", "% records", "presence rate of the features in the records", 10)
	PlotlibDrawer.plot_2(labels_importance.keys(), importances_list, importances_legends, "features rank", "features scores", "comparison of different transformation scores "+str(user_id), 11)
	PlotlibDrawer.plot_2(labels_importance_rank.keys(), ranks_list, ranks_legends, "features initial rank", "features rank after transformation", "comparison of different transformation ranks "+str(user_id), 11)
	PlotlibDrawer.plot_2(labels_importance.keys(), importances_derivatives_list, importances_derivatives_legends, "features initial rank", "features scores derivative", "comparison of different transformation scores derivative "+str(user_id), 11)
	
def transform_to_matrix_one_user(user_id):
	print "loading data for user "+str(user_id)
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)
	
	#order the data by the alphabetic name of the features
	print "ordering data "+str(user_id)
	data = collections.OrderedDict(sorted(data.items()))
	
	#get the first date and the last date
	print "getting first date and last date "
	end_date = date_min
	start_date = datetime.now()
	for feature, feature_data in data.iteritems():
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
		if begin_date < start_date:
			start_date = begin_date
			
		last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys())-1])
		if last_date > end_date:
			end_date = last_date
		
		data[feature] = feature_data
	
	#construct the data matrix
	#I- construct the matrices of all the features
	print "constructing the matrixes "
	rows = 0
	
	transformers = {} 
	for feature, feature_date in data.iteritems():
		if feature == "location":
			transformers[feature] = MatrixLocationFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
			transformers[feature] = MatrixBleutoothFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		else :
			transformers[feature] = MatrixFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
			
		if feature in features_importance_score_one:
			transformers[feature].let_importance_scores_to_1 = True
		
		transformers[feature].transform()
		rows += transformers[feature].nbdimentions
	
	#construct the time feature
	transformers[MatrixTimeFeatureTransformer.feature_name] = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
	transformers[MatrixTimeFeatureTransformer.feature_name].transform()
	rows +=  transformers[MatrixTimeFeatureTransformer.feature_name].nbdimentions
	columns = transformers[MatrixTimeFeatureTransformer.feature_name].nbtimeslots
	
	#II-concatenate all the matrices of each feature into one big matrix (do the same for the labels vector)
	print "regrouping the matrixes "
	data_matrix = np.zeros((columns, rows))
	labels_vector = [""]* rows
	dimentions_importance_score = np.zeros(rows)
	transformers = collections.OrderedDict(sorted(transformers.items()))
	
	begin_row_idex = 0
	end_row_index = 0
	for feature, feature_transformer in transformers.iteritems():
		end_row_index = begin_row_idex + feature_transformer.nbdimentions
		data_matrix[:, begin_row_idex:end_row_index] =  feature_transformer.matrix_data
		labels_vector[begin_row_idex:end_row_index] = feature_transformer.labels_vector
		dimentions_importance_score[begin_row_idex:end_row_index]=feature_transformer.realization_importance_score
		begin_row_idex = end_row_index
	
	'''
	The matrix contains many feature vectors that are 0 in every feature except the time features.
	Those vectors correspond to the time slots in which no record was made.
	We want to eliminate those vectors and their corresponding timestamps
	(a sketch of eliminate_empty_records is given after this function).
	'''
	time_vector = transformers.values()[0].time_vector
	[data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
	data_matrix = np.transpose(data_matrix)
	
	print "the labels are : "
	print JsonUtils.dict_as_json_str(labels_vector)
	
	
	print "first date of observation "+str(start_date)
	print "first date of observation "+str(end_date)
	print "dimension of the labels (features) vector : "+str(len(labels_vector))
	print "dimension of the time vector : "+str(len(time_vector))
	print "dimension of the resulted matrix (features, time) "+str(data_matrix.shape)
	print "the number of non zeros values is : "+str(np.count_nonzero(data_matrix))+"/"+str(np.size(data_matrix))
	print "the number of negative values in the matrix is : "+str(np.size(ma.masked_array(data_matrix, mask=(data_matrix>=0)).compressed()))
	print "the data matrix printed : "
	print Numpy.str(data_matrix)
	
	#write the matrix data
	MDataExtractor.save_matrix(user_id, data_matrix)
	
	#write the labels vector, then the time vector and the importance scores
	MDataExtractor.save_labels_vector(user_id, labels_vector)
	MDataExtractor.save_time_vector(user_id, time_vector)
	MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
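# eliminate_empty_records is not shown in this example. The sketch below only illustrates what the
# docstring above describes, assuming the indices of the time-feature columns are known and passed
# explicitly (a hypothetical third argument that the two-argument call above does not take).
import numpy as np

def eliminate_empty_records_sketch(data_matrix, time_vector, time_column_indices):
	#data_matrix has one row per time slot and one column per dimension, as built above
	feature_columns = [c for c in range(data_matrix.shape[1]) if c not in set(time_column_indices)]
	#keep a time slot (row) only when at least one non-time feature is non zero
	keep = np.any(data_matrix[:, feature_columns] != 0, axis=1)
	kept_times = [t for t, keep_it in zip(time_vector, keep) if keep_it]
	return [data_matrix[keep, :], kept_times]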