def transform_to_matrix_one_user(user_id):
	"""Build the data matrix of one user and save it with its companion vectors.

	The categorized JSON data of the user is loaded, one transformer is built
	per feature (plus one for the time feature) and the ``matrix_data`` of all
	transformers are concatenated into one matrix. The matrix is then passed
	through ``eliminate_empty_records``, transposed, summarised on stdout and
	written through ``MDataExtractor`` together with the labels vector, the
	time vector and the importance scores.

	user_id: identifier of the user, forwarded to DataExtractor and MDataExtractor.

	Returns nothing; the results are printed and saved.
	"""
	print("loading data for user "+str(user_id))
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)

	# order the data by the alphabetic name of the features
	print("ordering data "+str(user_id))
	data = collections.OrderedDict(sorted(data.items()))

	# get the first date and the last date
	print("getting first date and last date ")
	end_date = date_min
	# None means "no date seen yet"; the first feature always replaces it.
	# NOTE(review): the original initial value of start_date was lost - confirm
	# that a sentinel is acceptable when `data` is empty.
	start_date = None
	for feature, feature_data in data.iteritems():
		# sort the realizations so that the first/last keys are the extreme dates
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		realizations = feature_data.keys()
		begin_date = DataExtractor.start_date_of_realization(realizations[0])
		if start_date is None or begin_date < start_date:
			start_date = begin_date
		last_date = DataExtractor.start_date_of_realization(realizations[-1])
		if last_date > end_date:
			end_date = last_date
		data[feature] = feature_data

	# construct the data matrix
	# I- construct the matrices of all the features
	print("constructing the matrixes ")
	rows = 0
	transformers = {}
	for feature in data:
		if feature == "location":
			transformer_class = MatrixLocationFeatureTransformer
		elif feature in ("bluetoothSeen", "bluetoothPaired"):
			transformer_class = MatrixBleutoothFeatureTransformer
		else:
			transformer_class = MatrixFeatureTransformer
		transformers[feature] = transformer_class(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		if feature in features_importance_score_one:
			transformers[feature].let_importance_scores_to_1 = True
		rows += transformers[feature].nbdimentions

	# construct the time feature
	time_transformer = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
	transformers[MatrixTimeFeatureTransformer.feature_name] = time_transformer
	rows += time_transformer.nbdimentions
	columns = time_transformer.nbtimeslots

	# II- concatenate all the matrices of each feature into one big matrix
	# (do the same for the labels vector)
	print("regrouping the matrixes ")
	# one line per time slot here; the matrix is transposed before being saved
	data_matrix = np.zeros((columns, rows))
	labels_vector = [""] * rows
	# NOTE(review): this vector is saved below but never filled in this
	# function, so it is written as all zeros - confirm this is intended.
	dimentions_importance_score = np.zeros(rows)
	transformers = collections.OrderedDict(sorted(transformers.items()))
	begin_row_idex = 0
	for feature, feature_transformer in transformers.iteritems():
		end_row_index = begin_row_idex + feature_transformer.nbdimentions
		data_matrix[:, begin_row_idex:end_row_index] = feature_transformer.matrix_data
		labels_vector[begin_row_idex:end_row_index] = feature_transformer.labels_vector
		begin_row_idex = end_row_index

	# The matrix contains a lot of feature vectors that contain 0 in all the
	# features except the time features. Those vectors correspond to the times
	# where no record has been made. We want to eliminate those vectors and
	# their corresponding times.
	time_vector = transformers.values()[0].time_vector
	[data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
	data_matrix = np.transpose(data_matrix)

	print("the labels are : ")
	print(JsonUtils.dict_as_json_str(labels_vector))
	print("first date of observation "+str(start_date))
	print("last date of observation "+str(end_date))
	print("dimension of the labels (features) vector : "+str(len(labels_vector)))
	print("dimension of the time vector : "+str(len(time_vector)))
	print("dimension of the resulted matrix (features, time) "+str(data_matrix.shape))
	print("the number of non zeros values is : "+str(np.count_nonzero(data_matrix))+"/"+str(np.size(data_matrix)))
	print("the number of negative values in the matrix is : "+str(np.size(ma.masked_array(data_matrix, mask=(data_matrix>=0)).compressed())))
	print("the data matrix printed : ")
	print(Numpy.str(data_matrix))

	# write the matrix data
	MDataExtractor.save_matrix(user_id, data_matrix)
	# write the labels vector, then the time vector and the importance scores
	MDataExtractor.save_labels_vector(user_id, labels_vector)
	MDataExtractor.save_time_vector(user_id, time_vector)
	MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
import os.path
import matplotlib.pyplot as plt

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_time_coverage_one_user import location_time_coverage_one_user as tc_categorized

sys.path.insert(0, "/home/dehajjik/workspace/src/clean_data_exploration")
from location_time_coverage_one_user_clean import location_time_coverage_one_user_clean as tc_clean

from plot_lib_utils import *
from numpy_utils import *

from categorized_data_utils import DataExtractor
from plot_lib_utils import *

# Collect, for every user, the value returned by the location time-coverage
# routine on the categorized data and on the clean data.
user_ids = DataExtractor.users_ids_list()  # fetched once and reused below
coverage_cat = np.zeros(len(user_ids))
coverage_clean = np.zeros(len(user_ids))

for i, user_id in enumerate(user_ids):
    coverage_cat[i] = tc_categorized(user_id)
    coverage_clean[i] = tc_clean(user_id)

    print("user " + str(user_id) + " extracted")

print(coverage_cat)
print(coverage_clean)
fig, ax = plt.subplots()
def transform_to_rfv_one_user(user_id):
	"""Build the record/feature/value data of one user and save it.

	The categorized JSON data of the user is loaded and one
	``ValuesFeatureTransformer`` is built per feature, plus one
	``ValuesTimeFeatureTransformer`` that provides the day and hour features.
	Each record is a dict mapping a feature id (its index in ``features_name``)
	to the values of that feature for the record. Records that hold at most
	two features are dropped, the remaining ones are completed with the
	``nonpresent_v`` value for the non persistent features they lack, and the
	result is written through ``RVFDataExtractor.save_rvf``.

	user_id: identifier of the user, forwarded to DataExtractor and RVFDataExtractor.

	Returns nothing; the results are printed and saved.
	"""
	print("loading data for user "+str(user_id))
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)

	# order the data by the alphabetic name of the features
	print("ordering data "+str(user_id))
	data = collections.OrderedDict(sorted(data.items()))

	# get the first date and the last date
	print("getting first date and last date ")
	end_date = date_min
	# None means "no date seen yet"; the first feature always replaces it.
	# NOTE(review): the original initial value of start_date was lost - confirm
	# that a sentinel is acceptable when `data` is empty.
	start_date = None
	for feature, feature_data in data.iteritems():
		# sort the realizations so that the first/last keys are the extreme dates
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		realizations = feature_data.keys()
		begin_date = DataExtractor.start_date_of_realization(realizations[0])
		if start_date is None or begin_date < start_date:
			start_date = begin_date
		last_date = DataExtractor.start_date_of_realization(realizations[-1])
		if last_date > end_date:
			end_date = last_date
		data[feature] = feature_data

	# construct the values data
	# I- construct the values for all the features
	print("constructing the values data")
	transformers = {}
	features_name = []
	values_name = {}
	for feature in data:
		if feature == "location":
			matrix_transformer = MatrixLocationFeatureTransformer
		elif feature in ("bluetoothSeen", "bluetoothPaired"):
			matrix_transformer = MatrixBleutoothFeatureTransformer
		else:
			matrix_transformer = MatrixFeatureTransformer
		transformers[feature] = ValuesFeatureTransformer(matrix_transformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		# the feature must be registered before its id can be looked up
		features_name.append(feature)
		values_name[features_name.index(feature)] = transformers[feature].values_labels

	# construct the time feature: one transformer serves both the day and the hour features
	feature = "time"
	timetrans = ValuesTimeFeatureTransformer(MatrixTimeFeatureTransformer, feature, start_date, end_date, coocurring_precision)
	transformers[ValuesTimeFeatureTransformer.day_label] = timetrans
	transformers[ValuesTimeFeatureTransformer.hour_label] = timetrans
	features_name.append(ValuesTimeFeatureTransformer.day_label)
	features_name.append(ValuesTimeFeatureTransformer.hour_label)
	values_name[features_name.index(ValuesTimeFeatureTransformer.day_label)] = timetrans.day_values_labels
	values_name[features_name.index(ValuesTimeFeatureTransformer.hour_label)] = timetrans.time_values_labels
	records_labels = timetrans.records_dates
	records_nb = len(records_labels)

	# make space for records: one {feature id: values} dict per record
	records = [{} for _ in range(records_nb)]

	# II- fill the records
	for fid, fname in enumerate(features_name):
		if fname == ValuesTimeFeatureTransformer.day_label:
			feature_values = transformers[fname].day_values_data
		elif fname == ValuesTimeFeatureTransformer.hour_label:
			feature_values = transformers[fname].time_values_data
		else:
			feature_values = transformers[fname].values_data
		for r in range(records_nb):
			if feature_values[r] != []:
				records[r][fid] = feature_values[r]

	# remove the records that only contain values for the time features (day and hour)
	# NOTE(review): the labels are filtered together with the records so that
	# records_labels[i] still describes records[i] - confirm that save_rvf
	# expects the two to be aligned.
	kept = [(label, record) for label, record in zip(records_labels, records) if len(record) > 2]
	records_labels = [label for label, _ in kept]
	records = [record for _, record in kept]

	# for the remaining records, add the non_present value for the non
	# persistent features that are not present in each record
	nonpersistent_ids = [features_name.index(nf) for nf in nonpersistent_features if nf in features_name]
	for nfid in nonpersistent_ids:
		# add the non_present value as a value that can be taken by the non persistent features
		# (assumes the values labels are a mutable list - TODO confirm)
		if nonpresent_v not in values_name[nfid]:
			values_name[nfid].append(nonpresent_v)

	rtv_data = {}
	for idr, r in enumerate(records):
		for nfid in nonpersistent_ids:
			if nfid not in r:
				r[nfid] = [values_name[nfid].index(nonpresent_v)]
		# NOTE(review): keyed by the position of the record among the kept
		# records - confirm against the format read back by RVFDataExtractor.
		rtv_data[idr] = r

	print("first date of observation "+str(start_date))
	print("last date of observation "+str(end_date))
	print("features names "+str(features_name))
	print("values names : "+str(values_name))
	print("number of records "+str(len(rtv_data)))

	# write the data, the record dates, the feature names and the value names
	RVFDataExtractor.save_rvf(user_id, rtv_data, features_name, values_name, records_labels)
# Example #4
# For each user, output statistics that test the consistency of the extracted locations:
#	- show the distribution of the frequencies of the clusters
#	- show the distribution of the most frequent locations by hour of the day

#!/usr/bin/env python
import sys
import pprint as pp
import os.path

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_distribution_per_hour_one_user import location_distribution_per_hour_one_user as ldphou
from location_visits_distribution_one_user import location_visits_distribution_one_user as lvdou
from categorized_data_utils import DataExtractor
from plot_lib_utils import *

# Go through every user returned by DataExtractor.users_ids_list() and, for
# each of them, through the three options "week_end", "week_days" and "all".
for user_id in DataExtractor.users_ids_list():
	for option in ["week_end", "week_days", "all"]:
		# NOTE(review): the body of this loop is missing, so the file does not
		# parse as written. It presumably called the imported ldphou / lvdou
		# helpers for (user_id, option) - TODO restore from the original script.
	print("user "+str(user_id)+" extracted")
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_time_coverage_one_user import location_time_coverage_one_user as tc_categorized

sys.path.insert(0, "/home/dehajjik/workspace/src/clean_data_exploration")
from location_time_coverage_one_user_clean import location_time_coverage_one_user_clean as tc_clean

from plot_lib_utils import *
from numpy_utils import *

from categorized_data_utils import DataExtractor
from plot_lib_utils import *

# Collect, for every user, the value returned by the location time-coverage
# routine on the categorized data and on the clean data.
user_ids = DataExtractor.users_ids_list()  # fetched once and reused below
coverage_cat = np.zeros(len(user_ids))
coverage_clean = np.zeros(len(user_ids))

# enumerate gives each user its own slot; the index was previously never
# advanced, so every result overwrote position 0
for i, user_id in enumerate(user_ids):
	coverage_cat[i] = tc_categorized(user_id)
	coverage_clean[i] = tc_clean(user_id)

	print("user "+str(user_id)+" extracted")

print(coverage_cat)
print(coverage_clean)