import collections
import numpy as np
import numpy.ma as ma
from datetime import datetime
#DataExtractor, the Matrix*FeatureTransformer classes, JsonUtils, Numpy, MDataExtractor,
#date_min, coocurring_precision and features_importance_score_one are assumed to be
#provided by the project's utils modules imported at the top of the original file.

def transform_to_matrix_one_user(user_id):
    print "loading data for user "+str(user_id)
    categorized_data = DataExtractor.load_json_data(user_id)
    data = DataExtractor.complete_data(categorized_data)
    metadata = DataExtractor.complete_metadata(categorized_data)

    #order the data by the alphabetic name of the features
    print "ordering data "+str(user_id)
    data = collections.OrderedDict(sorted(data.items()))

    #get the first date and the last date
    print "getting first date and last date"
    end_date = date_min
    start_date = datetime.now()
    for feature, feature_data in data.iteritems():
        feature_data = collections.OrderedDict(sorted(feature_data.items()))
        begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
        if begin_date < start_date:
            start_date = begin_date
        last_date = DataExtractor.start_date_of_realization(feature_data.keys()[-1])
        if last_date > end_date:
            end_date = last_date
        data[feature] = feature_data

    #construct the data matrix
    #I- construct the matrices of all the features
    print "constructing the matrices"
    rows = 0
    transformers = {}
    for feature, feature_data in data.iteritems():
        if feature == "location":
            transformers[feature] = MatrixLocationFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
            transformers[feature] = MatrixBleutoothFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        else:
            transformers[feature] = MatrixFeatureTransformer(feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)

        if feature in features_importance_score_one:
            transformers[feature].let_importance_scores_to_1 = True

        transformers[feature].transform()
        rows += transformers[feature].nbdimentions

    #construct the time feature
    transformers[MatrixTimeFeatureTransformer.feature_name] = MatrixTimeFeatureTransformer(start_date, end_date, coocurring_precision)
    transformers[MatrixTimeFeatureTransformer.feature_name].transform()
    rows += transformers[MatrixTimeFeatureTransformer.feature_name].nbdimentions
    columns = transformers[MatrixTimeFeatureTransformer.feature_name].nbtimeslots

    #II- concatenate all the matrices of each feature into one big matrix
    #(do the same for the labels vector)
    print "regrouping the matrices"
    data_matrix = np.zeros((columns, rows))
    labels_vector = [""] * rows
    dimentions_importance_score = np.zeros(rows)
    transformers = collections.OrderedDict(sorted(transformers.items()))
    begin_row_index = 0
    end_row_index = 0
    for feature, feature_transformer in transformers.iteritems():
        end_row_index = begin_row_index + feature_transformer.nbdimentions
        data_matrix[:, begin_row_index:end_row_index] = feature_transformer.matrix_data
        labels_vector[begin_row_index:end_row_index] = feature_transformer.labels_vector
        dimentions_importance_score[begin_row_index:end_row_index] = feature_transformer.realization_importance_score
        begin_row_index = end_row_index

    '''
    The matrix contains many feature vectors that are 0 in all the features except
    the time features. Those vectors correspond to the time slots in which no record
    was made. We want to eliminate those time slots and their corresponding vectors.
    '''
    time_vector = transformers.values()[0].time_vector
    [data_matrix, time_vector] = eliminate_empty_records(data_matrix, time_vector)
    data_matrix = np.transpose(data_matrix)

    print "the labels are : "
    print JsonUtils.dict_as_json_str(labels_vector)
    print "first date of observation "+str(start_date)
    print "last date of observation "+str(end_date)
    print "dimension of the labels (features) vector : "+str(len(labels_vector))
    print "dimension of the time vector : "+str(len(time_vector))
    print "dimension of the resulting matrix (features, time) "+str(data_matrix.shape)
    print "the number of non zero values is : "+str(np.count_nonzero(data_matrix))+"/"+str(np.size(data_matrix))
    print "the number of negative values in the matrix is : "+str(np.size(ma.masked_array(data_matrix, mask=(data_matrix>=0)).compressed()))
    print "the data matrix printed : "
    print Numpy.str(data_matrix)

    #write the matrix data
    MDataExtractor.save_matrix(user_id, data_matrix)

    #write the labels vector, then the time vector and the importance scores
    MDataExtractor.save_labels_vector(user_id, labels_vector)
    MDataExtractor.save_time_vector(user_id, time_vector)
    MDataExtractor.save_importance_scores(user_id, dimentions_importance_score)
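#eliminate_empty_records is called above but not defined in this file. Below is a
#minimal sketch of what it is assumed to do, based on the comment that precedes the
#call: drop the time slots whose feature vector is zero everywhere outside the time
#dimensions. The name, the extra time_dims_mask argument and the zero test are
#assumptions, not the original implementation.
def eliminate_empty_records_sketch(data_matrix, time_vector, time_dims_mask):
    #data_matrix is (timeslots, dimensions), as laid out before the transpose above
    #time_dims_mask is a boolean array over the dimensions, True for time features
    non_time = data_matrix[:, ~time_dims_mask]
    kept = (non_time != 0).any(axis=1)
    return [data_matrix[kept, :], [t for t, keep in zip(time_vector, kept) if keep]]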
import sys
import os.path
import numpy as np
import matplotlib.pyplot as plt

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_time_coverage_one_user import location_time_coverage_one_user as tc_categorized
sys.path.insert(0, "/home/dehajjik/workspace/src/clean_data_exploration")
from location_time_coverage_one_user_clean import location_time_coverage_one_user_clean as tc_clean
from plot_lib_utils import *
from numpy_utils import *
from categorized_data_utils import DataExtractor

#compare the location time coverage of the categorized data against the clean data,
#user by user
coverage_cat = np.zeros(len(DataExtractor.users_ids_list()))
coverage_clean = np.zeros(len(DataExtractor.users_ids_list()))
i = 0
for user_id in DataExtractor.users_ids_list():
    coverage_cat[i] = tc_categorized(user_id)
    coverage_clean[i] = tc_clean(user_id)
    i += 1
    print("user "+str(user_id)+" extracted")

print coverage_cat
print coverage_clean

fig, ax = plt.subplots()
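#the script above stops right after creating the figure; a minimal sketch of the
#grouped bar chart it presumably goes on to build (bar positions, axis labels and
#the final show() call are assumptions, not the original code):
width = 0.35
indices = np.arange(len(coverage_cat))
ax.bar(indices, coverage_cat, width, label="categorized")
ax.bar(indices + width, coverage_clean, width, label="clean")
ax.set_xlabel("user")
ax.set_ylabel("location time coverage")
ax.legend()
plt.show()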
import collections
from datetime import datetime
#DataExtractor, the Values*/Matrix* transformer classes, RVFDataExtractor, date_min,
#coocurring_precision, nonpersistent_features and nonpresent_v are assumed to come
#from the project's utils modules imported at the top of the original file.

def transform_to_rfv_one_user(user_id):
    print "loading data for user "+str(user_id)
    categorized_data = DataExtractor.load_json_data(user_id)
    data = DataExtractor.complete_data(categorized_data)
    metadata = DataExtractor.complete_metadata(categorized_data)

    #order the data by the alphabetic name of the features
    print "ordering data "+str(user_id)
    data = collections.OrderedDict(sorted(data.items()))

    #get the first date and the last date
    print "getting first date and last date"
    end_date = date_min
    start_date = datetime.now()
    for feature, feature_data in data.iteritems():
        feature_data = collections.OrderedDict(sorted(feature_data.items()))
        begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
        if begin_date < start_date:
            start_date = begin_date
        last_date = DataExtractor.start_date_of_realization(feature_data.keys()[-1])
        if last_date > end_date:
            end_date = last_date
        data[feature] = feature_data

    #construct the values data
    #I- construct the values for all the features
    print "constructing the values data"
    transformers = {}
    features_name = []
    records = []
    values_name = {}
    for feature, feature_data in data.iteritems():
        if feature == "location":
            transformers[feature] = ValuesFeatureTransformer(MatrixLocationFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
            transformers[feature] = ValuesFeatureTransformer(MatrixBleutoothFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
        else:
            transformers[feature] = ValuesFeatureTransformer(MatrixFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)

        transformers[feature].transform()
        features_name.append(feature)
        values_name[features_name.index(feature)] = transformers[feature].values_labels

    #construct the time feature
    feature = "time"
    timetrans = ValuesTimeFeatureTransformer(MatrixTimeFeatureTransformer, feature, start_date, end_date, coocurring_precision)
    timetrans.transform()
    transformers[ValuesTimeFeatureTransformer.day_label] = timetrans
    transformers[ValuesTimeFeatureTransformer.hour_label] = timetrans
    features_name.append(ValuesTimeFeatureTransformer.day_label)
    values_name[features_name.index(ValuesTimeFeatureTransformer.day_label)] = timetrans.day_values_labels
    features_name.append(ValuesTimeFeatureTransformer.hour_label)
    values_name[features_name.index(ValuesTimeFeatureTransformer.hour_label)] = timetrans.time_values_labels
    records_labels = timetrans.records_dates
    records_nb = len(records_labels)

    #make space for the records
    for r in range(records_nb):
        records.append({})

    #II- fill the records
    for fid, fname in enumerate(features_name):
        if fname == ValuesTimeFeatureTransformer.day_label:
            for r in range(records_nb):
                if transformers[fname].day_values_data[r] != []:
                    records[r][fid] = transformers[fname].day_values_data[r]
        elif fname == ValuesTimeFeatureTransformer.hour_label:
            for r in range(records_nb):
                if transformers[fname].time_values_data[r] != []:
                    records[r][fid] = transformers[fname].time_values_data[r]
        else:
            for r in range(records_nb):
                if transformers[fname].values_data[r] != []:
                    records[r][fid] = transformers[fname].values_data[r]

    #remove the records that only contain values for the two time features (day and
    #hour); decreasing loop over the records so that deletion is safe
    for r in range(records_nb-1, -1, -1):
        if len(records[r]) <= 2:
            del records[r]
            del records_labels[r]  #keep the record dates aligned with the surviving records

    #for the remaining records, add the non_present value for the non persistent
    #features that are absent from each record
    for nf in nonpersistent_features:
        #add the non_present value as a value that can be taken by the non persistent features
        if nf in features_name:
            nfid = features_name.index(nf)
            values_name[nfid].append(nonpresent_v)

    rtv_data = {}
    for idr, r in enumerate(records):
        for nf in nonpersistent_features:
            if nf in features_name:
                nfid = features_name.index(nf)
                if nfid not in r:
                    r[nfid] = [values_name[nfid].index(nonpresent_v)]
        rtv_data[idr] = r

    print "first date of observation "+str(start_date)
    print "last date of observation "+str(end_date)
    print "features names "+str(features_name)
    print "values names : "+str(values_name)
    print "number of records "+str(len(rtv_data))

    #write the data, the record dates, the feature names and the value names
    RVFDataExtractor.save_rvf(user_id, rtv_data, features_name, values_name, records_labels)
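#a minimal usage sketch (not part of the original file): run the rfv transformation
#for every user, reusing the DataExtractor user enumeration from the utils module;
#the __main__ driver itself is an assumption.
if __name__ == '__main__':
    for user_id in DataExtractor.users_ids_list():
        transform_to_rfv_one_user(user_id)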
#!/usr/bin/env python
'''
For each user, outputs statistics that test the consistency of the extracted locations:
-show the distribution of frequencies of the clusters
-show the distribution of the most frequent locations by hour of the day
'''
import sys
import pprint as pp
import os.path

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_distribution_per_hour_one_user import location_distribution_per_hour_one_user as ldphou
from location_visits_distribution_one_user import location_visits_distribution_one_user as lvdou
from categorized_data_utils import DataExtractor
from plot_lib_utils import *

for user_id in DataExtractor.users_ids_list():
    for option in ["week_end", "week_days", "all"]:
        ldphou(user_id, option)
    lvdou(user_id)
    print("user "+str(user_id)+" extracted")

PlotlibDrawer.show()
import sys
import numpy as np

sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from location_time_coverage_one_user import location_time_coverage_one_user as tc_categorized
sys.path.insert(0, "/home/dehajjik/workspace/src/clean_data_exploration")
from location_time_coverage_one_user_clean import location_time_coverage_one_user_clean as tc_clean
from plot_lib_utils import *
from numpy_utils import *
from categorized_data_utils import DataExtractor

#print the location time coverage of the categorized data and of the clean data,
#user by user (non-plotting variant of the comparison script above)
coverage_cat = np.zeros(len(DataExtractor.users_ids_list()))
coverage_clean = np.zeros(len(DataExtractor.users_ids_list()))
i = 0
for user_id in DataExtractor.users_ids_list():
    coverage_cat[i] = tc_categorized(user_id)
    coverage_clean[i] = tc_clean(user_id)
    i += 1
    print("user "+str(user_id)+" extracted")

print coverage_cat
print coverage_clean