def compute_ghcm_mdt_one_user(user_id): file_name = "ghcm_mdt_user_" + str(user_id) print "loading matrix user " + str(user_id) + "..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) print "values" + JsonUtils.dict_as_json_str(valuesnames) print "data" + JsonUtils.dict_as_json_str(rfvdata[0]) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user " + str(user_id) + " has " + str( len(featuresnames)) + " features and " + str(len(rfvdata)) + " records" print "features names" print featuresnames print "values" + JsonUtils.dict_as_json_str(valuesnames) for k in [10, 20, 30]: #compute the ghcm_mdt ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size) print "computing SVD for user " + str(user_id) + "..." ghcm_mdt_comp.compute() print "constructing interpretable output for user " + str( user_id) + "..." ghcm_mdt_comp.construct_rows_interpretable_output( featuresnames, valuesnames, disp_m) r_output = ghcm_mdt_comp.rows_interpretable_output #write the result print "writing SVD result for user " + str(user_id) + "..." JsonLogsFileWriter.write(r_output, file_name)
def transform_to_rv_one_user(user_id): print "loading data for user " + str(user_id) [rfv, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) realization_names = [] rvid_table = dict() id = 0 for f in valuesnames: f_name = featuresnames[f] rvid_table[f] = dict() for v, v_name in enumerate(valuesnames[f]): realization_names += [f_name + "__" + v_name] rvid_table[f][v] = id id += 1 rv_data = dict() for r in rfv: rv_data[r] = [] for f in rfv[r]: for v in rfv[r][f]: rv_data[r] += [rvid_table[f][v]] #pdb.set_trace() RVDataExtractor.save_rv(user_id, rv_data, realization_names, recordsdates)
def compute_ghcm_mdt_one_user(user_id): file_name = "ghcm_mdt_user_"+str(user_id) print "loading matrix user "+str(user_id)+"..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) print "values" + JsonUtils.dict_as_json_str(valuesnames) print "data" + JsonUtils.dict_as_json_str(rfvdata[0]) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user "+str(user_id)+" has "+str(len(featuresnames))+" features and "+str(len(rfvdata))+" records" print "features names" print featuresnames print "values" + JsonUtils.dict_as_json_str(valuesnames) for k in [10,20,30]: #compute the ghcm_mdt ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size) print "computing SVD for user "+str(user_id)+"..." ghcm_mdt_comp.compute() print "constructing interpretable output for user "+str(user_id)+"..." ghcm_mdt_comp.construct_rows_interpretable_output(featuresnames, valuesnames, disp_m) r_output = ghcm_mdt_comp.rows_interpretable_output #write the result print "writing SVD result for user "+str(user_id)+"..." JsonLogsFileWriter.write(r_output, file_name)
def transform_to_rv_one_user(user_id): print "loading data for user "+str(user_id) [rfv, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) realization_names = [] rvid_table = dict() id = 0 for f in valuesnames: f_name = featuresnames[f] rvid_table[f]=dict() for v, v_name in enumerate(valuesnames[f]): realization_names += [f_name+"__"+v_name] rvid_table[f][v]=id id+=1 rv_data = dict() for r in rfv: rv_data[r] = [] for f in rfv[r]: for v in rfv[r][f]: rv_data[r] += [rvid_table[f][v]] #pdb.set_trace() RVDataExtractor.save_rv(user_id, rv_data, realization_names, recordsdates)
def evaluation_on_day_prediction_one_user(user_id): print "loading matrix user "+str(user_id)+"..." data_matrix = MDataExtractor.load_matrix(user_id) rows_labels = MDataExtractor.load_labels_vector(user_id) columns_labels = MDataExtractor.load_time_vector(user_id) importance_scores = MDataExtractor.load_importance_scores(user_id) print "user "+str(user_id)+" has "+str(len(rows_labels))+" features (rows) and "+str(len(columns_labels))+" realization (columns)" print "loading rfv user "+str(user_id)+"..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user "+str(user_id)+" has "+str(len(rfvdata))+" records" print "loading rv user "+str(user_id)+"..." [rv_data, realizationsnames, recordsdates] = RVDataExtractor.load_rv(user_id) realization_size = len(realizationsnames) classifiers = {"Random ": RandomClassifier()}#add the random classifier classifiers = {"Most Frequent ": SVDClassifier("idf", 1)}#add the most frequent classifier ''' for k in range(1,21): classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) #add a ghcmdt classifier for each k classifiers["LCBMF k="+str(k)] = LCBMFClassifier(None, None, None, k) #add a lcbmf classifier for each k #add a svd classifier for each k''' '''for k in [1,10,20,30,40,50,60]: c =GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["GHCM_MDT k="+str(k)] = c''' for k in [1,10,20,30,50,70]: classifiers["SVD k="+str(k)] = SVDClassifier("idf", k) classifiers["LDA k="+str(k)] = LDA_Value_Classifier(k, realization_size) classifiers["HCM_MDT k="+str(k)] = HCM_MDT_Value_Classifier(k, vocab_size) classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["LCBMF k="+str(k)] = LCBMFClassifier(None, None, None, k) eval_handler = FpEvaluatorHandler() eval_handler.instanciate_m (MDayPredictionEvaluator, data_matrix, rows_labels, columns_labels) 
eval_handler.instanciate_rvf (RfvDayPredictionEvaluator, rfvdata, featuresnames, valuesnames) eval_handler.instantiate_rv(RvDayPredictionEvaluator,rv_data, realizationsnames) evaluations = {} nb=1 for name, classifier in classifiers.iteritems(): print "evaluating "+name+" on user "+str(user_id)+" nb "+str(nb)+"/"+str(len(classifiers)) #results = [good_predictions, total_predictions, accuracy, macro_average_acc_by_class, accuracy_class1,...,accuracy_classn] results = eval_handler.evaluate(classifier) evaluations[name] = results nb+=1 classes = eval_handler.classes return [evaluations, classes]
def evaluation_on_applaunch_prediction_one_user(user_id): print "loading rfv user " + str(user_id) + "..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user " + str(user_id) + " has " + str(len(rfvdata)) + " records" print "loading rv user " + str(user_id) + "..." [rv_data, realizationsnames, recordsdates] = RVDataExtractor.load_rv(user_id) realization_size = len(realizationsnames) classifiers = {"Random ": RandomClassifier()} #add the random classifier classifiers = { "Most Frequent ": HCM_MDT_Value_Classifier(1, vocab_size) } #add the most frequent classifier ''' for k in range(1,21): classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) #add a ghcmdt classifier for each k classifiers["LCBMF k="+str(k)] = LCBMFClassifier(None, None, None, k) #add a lcbmf classifier for each k #add a svd classifier for each k''' '''for k in [1,10,20,30,40,50,60]: c =GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["GHCM_MDT k="+str(k)] = c''' for k in [1, 10, 20, 30, 50, 70]: classifiers["LDA k=" + str(k)] = LDA_Value_Classifier( k, realization_size) classifiers["GHCM_MDT k=" + str(k)] = GHCM_MDT_Value_Classifier( k, vocab_size) classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier( k, vocab_size) eval_handler = FpEvaluatorHandler() eval_handler.instanciate_rvf(RfvApplaunchPredictionEvaluator, rfvdata, featuresnames, valuesnames) eval_handler.instantiate_rv(RvApplaunchPredictionEvaluator, rv_data, realizationsnames) evaluations = {} nb = 1 for name, classifier in classifiers.iteritems(): print "evaluating " + name + " on user " + str(user_id) + " nb " + str( nb) + "/" + str(len(classifiers)) #results = [good_predictions, total_predictions, accuracy, macro_average_acc_by_class, accuracy_class1,...,accuracy_classn] results = eval_handler.evaluate(classifier) evaluations[name] = results nb += 1 classes = eval_handler.classes 
return [evaluations, classes]
def evaluation_perplexity_one_user(user_id): print "loading rfv user "+str(user_id)+"..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user "+str(user_id)+" has "+str(len(rfvdata))+" records" classifiers={} ''' for k in range(1,21): classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) #add a ghcmdt classifier for each k classifiers["LCBMF k="+str(k)] = LCBMFClassifier(None, None, None, k) #add a lcbmf classifier for each k classifiers["SVD k="+str(k)] = SVDClassifier("idf", k) #add a svd classifier for each k''' k = 8 for k in [1,3,5,7,10]: #classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["HCM_MDT k="+str(k)] = HCM_MDT_Value_Classifier(k, vocab_size) eval_handler = PerpEvaluatorHandler() #eval_handler.instanciate_rv (MLocationPredictionEvaluator, data_matrix, rows_labels, columns_labels) eval_handler.instanciate_rvf (RfvPerplexityEvaluator, rfvdata, featuresnames, valuesnames) evaluations = {} for name, classifier in classifiers.iteritems(): print "evaluating "+name+" on user "+str(user_id) #results = [good_predictions, total_predictions, accuracy, macro_average_acc_by_class, accuracy_class1,...,accuracy_classn] results = eval_handler.evaluate(classifier) evaluations[name] = results print "results for "+name+" on user "+str(user_id)+" : "+str(results) return evaluations
def evaluation_on_location_prediction_one_user(user_id): print "loading matrix user " + str(user_id) + "..." data_matrix = MDataExtractor.load_matrix(user_id) rows_labels = MDataExtractor.load_labels_vector(user_id) columns_labels = MDataExtractor.load_time_vector(user_id) importance_scores = MDataExtractor.load_importance_scores(user_id) print "user " + str(user_id) + " has " + str( len(rows_labels)) + " features (rows) and " + str( len(columns_labels)) + " realization (columns)" print "loading rfv user " + str(user_id) + "..." [rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id) vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))] print "user " + str(user_id) + " has " + str(len(rfvdata)) + " records" print "loading rv user " + str(user_id) + "..." [rv_data, realizationsnames, recordsdates] = RVDataExtractor.load_rv(user_id) realization_size = len(realizationsnames) classifiers = {"Random ": RandomClassifier()} #add the random classifier classifiers = { "Most Frequent ": SVDClassifier("idf", 1) } #add the most frequent classifier ''' for k in range(1,21): classifiers["GHCM_MDT k="+str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size) #add a ghcmdt classifier for each k classifiers["LCBMF k="+str(k)] = LCBMFClassifier(None, None, None, k) #add a lcbmf classifier for each k classifiers["SVD k="+str(k)] = SVDClassifier("idf", k) #add a svd classifier for each k''' '''for k in [1,10,20,30,40,50,60]: c =GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["GHCM_MDT k="+str(k)] = c''' '''for k in [10,15,20,25,30,35]: c =GHCM_MDT_Value_Classifier(k, vocab_size) classifiers["GHCM_MDT k="+str(k)] = c''' for k in [1, 10, 20, 30, 50, 70]: classifiers["SVD k=" + str(k)] = SVDClassifier("idf", k) classifiers["LDA k=" + str(k)] = LDA_Value_Classifier( k, realization_size) classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier( k, vocab_size) classifiers["GHCM_MDT k=" + str(k)] = GHCM_MDT_Value_Classifier( k, 
vocab_size) classifiers["LCBMF k=" + str(k)] = LCBMFClassifier(None, None, None, k) eval_handler = FpEvaluatorHandler() eval_handler.instanciate_m(MLocationPredictionEvaluator, data_matrix, rows_labels, columns_labels) eval_handler.instanciate_rvf(RfvLocationPredictionEvaluator, rfvdata, featuresnames, valuesnames) eval_handler.instantiate_rv(RvLocationPredictionEvaluator, rv_data, realizationsnames) evaluations = {} for name, classifier in classifiers.iteritems(): print "evaluating " + name + " on user " + str(user_id) #results = [good_predictions, total_predictions, accuracy, macro_average_acc_by_class, accuracy_class1,...,accuracy_classn] results = eval_handler.evaluate(classifier) evaluations[name] = results classes = eval_handler.classes return [evaluations, classes]
def transform_to_rfv_one_user(user_id): print "loading data for user "+str(user_id) categorized_data = DataExtractor.load_json_data(user_id) data = DataExtractor.complete_data(categorized_data) metadata = DataExtractor.complete_metadata(categorized_data) #order the data by the alphabetic name of the features print "ordering data "+str(user_id) data = collections.OrderedDict(sorted(data.items())) #get the first date and the last date print "getting first date and last date " end_date = date_min start_date = datetime.now() for feature, feature_data in data.iteritems(): feature_data = collections.OrderedDict(sorted(feature_data.items())) begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0]) if begin_date < start_date: start_date = begin_date last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys())-1]) if last_date > end_date: end_date = last_date data[feature] = feature_data #construct the values data #I- construct the values for all the features print "constructing the values data" transformers = {} features_name = [] records = [] values_name = {} for feature, feature_date in data.iteritems(): if feature == "location": transformers[feature] = ValuesFeatureTransformer(MatrixLocationFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision) elif feature == "bluetoothSeen" or feature == "bluetoothPaired": transformers[feature] = ValuesFeatureTransformer(MatrixBleutoothFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision) else : transformers[feature] = ValuesFeatureTransformer(MatrixFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision) transformers[feature].transform() features_name.append(feature) values_name[features_name.index(feature)] = transformers[feature].values_labels #construct the time feature feature = "time" timetrans = 
ValuesTimeFeatureTransformer(MatrixTimeFeatureTransformer, feature, start_date, end_date, coocurring_precision) timetrans.transform() transformers[ValuesTimeFeatureTransformer.day_label] = timetrans transformers[ValuesTimeFeatureTransformer.hour_label] = timetrans features_name.append(ValuesTimeFeatureTransformer.day_label) values_name[features_name.index(ValuesTimeFeatureTransformer.day_label)] = timetrans.day_values_labels features_name.append(ValuesTimeFeatureTransformer.hour_label) values_name[features_name.index(ValuesTimeFeatureTransformer.hour_label)] = timetrans.time_values_labels records_labels = timetrans.records_dates records_nb = len(records_labels) #make space for records for r in range(records_nb): records.append({}) #II-fill the records for fid, fname in enumerate(features_name): if fname == ValuesTimeFeatureTransformer.day_label: for r in range(records_nb): if transformers[fname].day_values_data[r]!= []: records[r][fid] = transformers[fname].day_values_data[r]; elif fname == ValuesTimeFeatureTransformer.hour_label: for r in range(records_nb): if transformers[fname].time_values_data[r]!= []: records[r][fid] = transformers[fname].time_values_data[r]; else: for r in range(records_nb): if transformers[fname].values_data[r]!= []: records[r][fid] = transformers[fname].values_data[r]; #remove the ones that only contain value for the time feature for r in range(records_nb-1, -1, -1): #Decreasing loop over the records so that remove is possible if len(records[r]) <= 2: del records[r] #for the remaining records, add non_present values for the non_persistant features that are not present in each record. 
non_persistant for nf in nonpersistent_features: #add the non_present value as a value that can be taken by the non persistent features if nf in features_name: nfid = features_name.index(nf) values_name[nfid].append(nonpresent_v) rtv_data = {} for idr, r in enumerate(records): for nf in nonpersistent_features: if nf in features_name: nfid = features_name.index(nf) if nfid not in r: r[nfid]=[values_name[nfid].index(nonpresent_v)]; rtv_data[idr]=r print "first date of observation "+str(start_date) print "first date of observation "+str(end_date) print "features names "+str(features_name) print "values names : "+str(values_name) print "number of records "+str(len(rtv_data)) #write the data, the record dates, the feature names and the value names RVFDataExtractor.save_rvf(user_id, rtv_data, features_name, values_name, records_labels)
#!/usr/bin/env python
import sys
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from compute_ghcm_mdt_one_user import compute_ghcm_mdt_one_user as cgou
from rfv_data_utils import RVFDataExtractor

'''
make the categorized data transformation for all the users
'''


def main():
    # run the ghcm_mdt computation for every known user
    for user_id in RVFDataExtractor.users_ids_list():
        print("user " + str(user_id) + " on working")
        cgou(user_id)
        print("user " + str(user_id) + " extracted")


# guard so importing this module does not trigger the whole batch run
if __name__ == "__main__":
    main()