# Esempio n. 1  (Example no. 1 — scrape separator)
# 0
def compute_ghcm_mdt_one_user(user_id):
    """Run the GHCM_MDT decomposition for one user at k = 10, 20, 30 and
    write an interpretable JSON output for each run.

    Relies on a free name `disp_m` defined elsewhere -- TODO confirm.
    """
    uid = str(user_id)
    file_name = "ghcm_mdt_user_" + uid
    print("loading matrix user " + uid + "...")
    rfvdata, featuresnames, valuesnames, recordsdates = \
        RVFDataExtractor.load_rvf(user_id)
    print("values" + JsonUtils.dict_as_json_str(valuesnames))
    print("data" + JsonUtils.dict_as_json_str(rfvdata[0]))
    # one vocabulary size per feature; assumes feature ids are 0..n-1
    vocab_size = []
    for f in range(len(valuesnames)):
        vocab_size.append(len(valuesnames[f]))
    print("user " + uid + " has " + str(len(featuresnames)) +
          " features and " + str(len(rfvdata)) + " records")
    print("features names")
    print(featuresnames)

    print("values" + JsonUtils.dict_as_json_str(valuesnames))
    for k in (10, 20, 30):
        # fit the GHCM_MDT model with k topics
        ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
        print("computing SVD for user " + uid + "...")
        ghcm_mdt_comp.compute()

        print("constructing interpretable output for user " + uid + "...")
        ghcm_mdt_comp.construct_rows_interpretable_output(
            featuresnames, valuesnames, disp_m)
        r_output = ghcm_mdt_comp.rows_interpretable_output

        # persist the interpretable rows
        print("writing SVD result for user " + uid + "...")
        JsonLogsFileWriter.write(r_output, file_name)
def transform_to_rv_one_user(user_id):
    """Convert one user's RFV (record/feature/value) data into the flat RV
    (record/value) representation and save it with RVDataExtractor.save_rv.

    Each (feature, value) pair is assigned a unique integer realization id
    and a human-readable "feature__value" name.
    """
    print("loading data for user " + str(user_id))
    rfv, featuresnames, valuesnames, recordsdates = \
        RVFDataExtractor.load_rvf(user_id)

    # Build the flat realization-id space.
    realization_names = []
    rvid_table = dict()
    next_id = 0  # renamed from 'id' to stop shadowing the builtin
    for f in valuesnames:
        f_name = featuresnames[f]
        rvid_table[f] = dict()
        for v, v_name in enumerate(valuesnames[f]):
            realization_names.append(f_name + "__" + v_name)
            rvid_table[f][v] = next_id
            next_id += 1

    # Translate each record from {feature: [values]} to a flat id list,
    # preserving the original feature-then-value iteration order.
    rv_data = dict()
    for r in rfv:
        rv_data[r] = [rvid_table[f][v] for f in rfv[r] for v in rfv[r][f]]

    RVDataExtractor.save_rv(user_id, rv_data, realization_names, recordsdates)
def compute_ghcm_mdt_one_user(user_id):
	"""Run the GHCM_MDT decomposition for one user at k = 10, 20, 30 and
	write an interpretable JSON output for each run.

	NOTE(review): duplicates the space-indented definition of the same
	name earlier in this file; at import time this later definition wins.
	Relies on a free name `disp_m` that is not defined here -- confirm it
	exists at module scope.
	"""
	file_name = "ghcm_mdt_user_"+str(user_id)
	print "loading matrix user "+str(user_id)+"..."
	[rfvdata, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id)
	print "values" + JsonUtils.dict_as_json_str(valuesnames)
	print "data" + JsonUtils.dict_as_json_str(rfvdata[0])
	# one vocabulary size per feature; assumes feature ids are 0..n-1
	vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames.keys()))]
	print "user "+str(user_id)+" has "+str(len(featuresnames))+" features and "+str(len(rfvdata))+" records"
	print "features names"
	print featuresnames
	
	print "values" + JsonUtils.dict_as_json_str(valuesnames)
	for k in [10,20,30]:
		#compute the ghcm_mdt
		ghcm_mdt_comp = GHCM_MDTComputer(rfvdata, k, vocab_size)
		print "computing SVD for user "+str(user_id)+"..."  # NOTE(review): message says SVD but this runs GHCM_MDT
		ghcm_mdt_comp.compute()
		
		print "constructing interpretable output for user "+str(user_id)+"..."
		ghcm_mdt_comp.construct_rows_interpretable_output(featuresnames, valuesnames, disp_m)
		r_output = ghcm_mdt_comp.rows_interpretable_output
		
		#write the result
		print "writing SVD result for user "+str(user_id)+"..."  # NOTE(review): message says SVD but this is the GHCM_MDT result
		JsonLogsFileWriter.write(r_output, file_name)
def transform_to_rv_one_user(user_id):
	"""Convert one user's RFV data into the flat RV (record/value)
	representation and save it with RVDataExtractor.save_rv.

	NOTE(review): duplicates the space-indented definition of the same
	name earlier in this file; at import time this later definition wins.
	"""
	
		
	print "loading data for user "+str(user_id)
	[rfv, featuresnames, valuesnames, recordsdates] = RVFDataExtractor.load_rvf(user_id)
	
	
	# Assign every (feature, value) pair a unique integer realization id
	# and a readable "feature__value" name.
	realization_names = []
	rvid_table = dict()
	id = 0  # NOTE(review): shadows the builtin 'id'
	for f in valuesnames:
		f_name = featuresnames[f]
		rvid_table[f]=dict()
		for v, v_name in enumerate(valuesnames[f]):
			realization_names += [f_name+"__"+v_name]
			rvid_table[f][v]=id
			id+=1
	
	# Flatten each record {feature: [values]} into a list of realization ids.
	rv_data = dict()
	for r in rfv:
		rv_data[r] = []
		for f in rfv[r]:
			for v in rfv[r][f]:
				rv_data[r] += [rvid_table[f][v]]
				
	#pdb.set_trace()
	
	RVDataExtractor.save_rv(user_id, rv_data, realization_names, recordsdates)
	

	
	
def evaluation_on_day_prediction_one_user(user_id):
	"""Evaluate day prediction for one user across several classifiers.

	Loads the user's matrix, RFV and RV representations, builds a pool of
	baseline classifiers plus model families over several k values, and
	evaluates each with FpEvaluatorHandler.

	Returns [evaluations, classes] where `evaluations` maps classifier
	name to its result list and `classes` comes from the handler.
	"""
	print("loading matrix user " + str(user_id) + "...")
	data_matrix = MDataExtractor.load_matrix(user_id)
	rows_labels = MDataExtractor.load_labels_vector(user_id)
	columns_labels = MDataExtractor.load_time_vector(user_id)
	importance_scores = MDataExtractor.load_importance_scores(user_id)  # loaded but unused below -- TODO confirm it can be dropped
	print("user " + str(user_id) + " has " + str(len(rows_labels)) + " features (rows) and " + str(len(columns_labels)) + " realization (columns)")

	print("loading rfv user " + str(user_id) + "...")
	rfvdata, featuresnames, valuesnames, recordsdates = RVFDataExtractor.load_rvf(user_id)
	# one vocabulary size per feature; assumes feature ids are 0..n-1
	vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
	print("user " + str(user_id) + " has " + str(len(rfvdata)) + " records")

	print("loading rv user " + str(user_id) + "...")
	rv_data, realizationsnames, recordsdates = RVDataExtractor.load_rv(user_id)
	realization_size = len(realizationsnames)

	classifiers = {"Random ": RandomClassifier()}  # baseline: random guess
	# BUG FIX: this used to rebind `classifiers`, silently discarding the
	# random baseline; now the most-frequent baseline is added alongside it.
	classifiers["Most Frequent "] = SVDClassifier("idf", 1)

	for k in [1, 10, 20, 30, 50, 70]:
		classifiers["SVD k=" + str(k)] = SVDClassifier("idf", k)
		classifiers["LDA k=" + str(k)] = LDA_Value_Classifier(k, realization_size)
		classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier(k, vocab_size)
		classifiers["GHCM_MDT k=" + str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size)
		classifiers["LCBMF k=" + str(k)] = LCBMFClassifier(None, None, None, k)

	eval_handler = FpEvaluatorHandler()
	eval_handler.instanciate_m(MDayPredictionEvaluator, data_matrix, rows_labels, columns_labels)
	eval_handler.instanciate_rvf(RfvDayPredictionEvaluator, rfvdata, featuresnames, valuesnames)
	eval_handler.instantiate_rv(RvDayPredictionEvaluator, rv_data, realizationsnames)

	evaluations = {}
	nb = 1
	for name, classifier in classifiers.iteritems():
		print("evaluating " + name + " on user " + str(user_id) + " nb " + str(nb) + "/" + str(len(classifiers)))
		# results = [good_predictions, total_predictions, accuracy,
		#            macro_average_acc_by_class, accuracy_class1, ..., accuracy_classn]
		results = eval_handler.evaluate(classifier)
		evaluations[name] = results
		nb += 1

	classes = eval_handler.classes
	return [evaluations, classes]
# Esempio n. 6  (Example no. 6 — scrape separator)
# 0
def evaluation_on_applaunch_prediction_one_user(user_id):
    """Evaluate app-launch prediction for one user across classifiers.

    Loads the user's RFV and RV representations, builds a pool of baseline
    and model classifiers, and evaluates each with FpEvaluatorHandler.

    Returns [evaluations, classes] where `evaluations` maps classifier
    name to its result list and `classes` comes from the handler.
    """
    print("loading rfv user " + str(user_id) + "...")
    rfvdata, featuresnames, valuesnames, recordsdates = \
        RVFDataExtractor.load_rvf(user_id)
    # one vocabulary size per feature; assumes feature ids are 0..n-1
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
    print("user " + str(user_id) + " has " + str(len(rfvdata)) + " records")

    print("loading rv user " + str(user_id) + "...")
    rv_data, realizationsnames, recordsdates = RVDataExtractor.load_rv(user_id)
    realization_size = len(realizationsnames)

    classifiers = {"Random ": RandomClassifier()}  # baseline: random guess
    # BUG FIX: this used to rebind `classifiers`, silently discarding the
    # random baseline; now the most-frequent baseline is added alongside it.
    classifiers["Most Frequent "] = HCM_MDT_Value_Classifier(1, vocab_size)

    for k in [1, 10, 20, 30, 50, 70]:
        classifiers["LDA k=" + str(k)] = LDA_Value_Classifier(k, realization_size)
        classifiers["GHCM_MDT k=" + str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size)
        classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier(k, vocab_size)

    eval_handler = FpEvaluatorHandler()
    eval_handler.instanciate_rvf(RfvApplaunchPredictionEvaluator, rfvdata,
                                 featuresnames, valuesnames)
    eval_handler.instantiate_rv(RvApplaunchPredictionEvaluator, rv_data,
                                realizationsnames)

    evaluations = {}
    nb = 1
    for name, classifier in classifiers.iteritems():
        print("evaluating " + name + " on user " + str(user_id) + " nb " +
              str(nb) + "/" + str(len(classifiers)))
        # results = [good_predictions, total_predictions, accuracy,
        #            macro_average_acc_by_class, accuracy_class1, ..., accuracy_classn]
        results = eval_handler.evaluate(classifier)
        evaluations[name] = results
        nb += 1

    classes = eval_handler.classes
    return [evaluations, classes]
# Esempio n. 7  (Example no. 7 — scrape separator)
# 0
def evaluation_perplexity_one_user(user_id):
	"""Evaluate perplexity for one user over HCM_MDT models.

	Loads the user's RFV representation, builds an HCM_MDT classifier for
	each k in [1, 3, 5, 7, 10], evaluates each with PerpEvaluatorHandler
	and returns a dict mapping classifier name to its result list.
	"""
	print("loading rfv user " + str(user_id) + "...")
	rfvdata, featuresnames, valuesnames, recordsdates = RVFDataExtractor.load_rvf(user_id)
	# one vocabulary size per feature; assumes feature ids are 0..n-1
	vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
	print("user " + str(user_id) + " has " + str(len(rfvdata)) + " records")

	# FIX: removed the dead `k = 8` assignment that was immediately
	# overwritten by the loop below, plus stale commented-out code.
	classifiers = {}
	for k in [1, 3, 5, 7, 10]:
		classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier(k, vocab_size)

	eval_handler = PerpEvaluatorHandler()
	eval_handler.instanciate_rvf(RfvPerplexityEvaluator, rfvdata, featuresnames, valuesnames)

	evaluations = {}
	for name, classifier in classifiers.iteritems():
		print("evaluating " + name + " on user " + str(user_id))
		# results = [good_predictions, total_predictions, accuracy,
		#            macro_average_acc_by_class, accuracy_class1, ..., accuracy_classn]
		results = eval_handler.evaluate(classifier)
		evaluations[name] = results
		print("results for " + name + " on user " + str(user_id) + " : " + str(results))

	return evaluations
	
				
		
# Esempio n. 8  (Example no. 8 — scrape separator)
# 0
def evaluation_on_location_prediction_one_user(user_id):
    """Evaluate location prediction for one user across classifiers.

    Loads the user's matrix, RFV and RV representations, builds a pool of
    baseline classifiers plus model families over several k values, and
    evaluates each with FpEvaluatorHandler.

    Returns [evaluations, classes] where `evaluations` maps classifier
    name to its result list and `classes` comes from the handler.
    """
    print("loading matrix user " + str(user_id) + "...")
    data_matrix = MDataExtractor.load_matrix(user_id)
    rows_labels = MDataExtractor.load_labels_vector(user_id)
    columns_labels = MDataExtractor.load_time_vector(user_id)
    importance_scores = MDataExtractor.load_importance_scores(user_id)  # loaded but unused below -- TODO confirm it can be dropped
    print("user " + str(user_id) + " has " + str(len(rows_labels)) +
          " features (rows) and " + str(len(columns_labels)) +
          " realization (columns)")

    print("loading rfv user " + str(user_id) + "...")
    rfvdata, featuresnames, valuesnames, recordsdates = \
        RVFDataExtractor.load_rvf(user_id)
    # one vocabulary size per feature; assumes feature ids are 0..n-1
    vocab_size = [len(valuesnames[f]) for f in range(len(valuesnames))]
    print("user " + str(user_id) + " has " + str(len(rfvdata)) + " records")

    print("loading rv user " + str(user_id) + "...")
    rv_data, realizationsnames, recordsdates = RVDataExtractor.load_rv(user_id)
    realization_size = len(realizationsnames)

    classifiers = {"Random ": RandomClassifier()}  # baseline: random guess
    # BUG FIX: this used to rebind `classifiers`, silently discarding the
    # random baseline; now the most-frequent baseline is added alongside it.
    classifiers["Most Frequent "] = SVDClassifier("idf", 1)

    for k in [1, 10, 20, 30, 50, 70]:
        classifiers["SVD k=" + str(k)] = SVDClassifier("idf", k)
        classifiers["LDA k=" + str(k)] = LDA_Value_Classifier(k, realization_size)
        classifiers["HCM_MDT k=" + str(k)] = HCM_MDT_Value_Classifier(k, vocab_size)
        classifiers["GHCM_MDT k=" + str(k)] = GHCM_MDT_Value_Classifier(k, vocab_size)
        classifiers["LCBMF k=" + str(k)] = LCBMFClassifier(None, None, None, k)

    eval_handler = FpEvaluatorHandler()
    eval_handler.instanciate_m(MLocationPredictionEvaluator, data_matrix,
                               rows_labels, columns_labels)
    eval_handler.instanciate_rvf(RfvLocationPredictionEvaluator, rfvdata,
                                 featuresnames, valuesnames)
    eval_handler.instantiate_rv(RvLocationPredictionEvaluator, rv_data,
                                realizationsnames)

    evaluations = {}

    for name, classifier in classifiers.iteritems():
        print("evaluating " + name + " on user " + str(user_id))
        # results = [good_predictions, total_predictions, accuracy,
        #            macro_average_acc_by_class, accuracy_class1, ..., accuracy_classn]
        results = eval_handler.evaluate(classifier)
        evaluations[name] = results

    classes = eval_handler.classes
    return [evaluations, classes]
def transform_to_rfv_one_user(user_id):
	"""Transform one user's categorized JSON data into the RFV
	(record/feature/value) representation and save it.

	NOTE(review): relies on free names not defined in this view
	(`date_min`, `coocurring_precision`, `nonpersistent_features`,
	`nonpresent_v`) -- presumably module-level constants; confirm.
	"""
	
		
	print "loading data for user "+str(user_id)
	categorized_data = DataExtractor.load_json_data(user_id)
	data = DataExtractor.complete_data(categorized_data)
	metadata = DataExtractor.complete_metadata(categorized_data)
	
	#order the data by the alphabetic name of the features
	print "ordering data "+str(user_id)
	data = collections.OrderedDict(sorted(data.items()))
	
	#get the first date and the last date
	# Scan every feature's (sorted) realizations to find the overall
	# earliest start date and latest start date across features.
	print "getting first date and last date "
	end_date = date_min
	start_date = datetime.now()
	for feature, feature_data in data.iteritems():
		feature_data = collections.OrderedDict(sorted(feature_data.items()))
		begin_date = DataExtractor.start_date_of_realization(feature_data.keys()[0])
		if begin_date < start_date:
			start_date = begin_date
			
		last_date = DataExtractor.start_date_of_realization(feature_data.keys()[len(feature_data.keys())-1])
		if last_date > end_date:
			end_date = last_date
		
		data[feature] = feature_data
	
	#construct the values data
	#I- construct the values for all the features
	print "constructing the values data"
	
	transformers = {} 
	features_name = []
	records = []
	values_name = {}
	# Pick a transformer class per feature type; each transformer exposes
	# values_labels after transform().
	for feature, feature_date in data.iteritems():
		if feature == "location":
			transformers[feature] = ValuesFeatureTransformer(MatrixLocationFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		elif feature == "bluetoothSeen" or feature == "bluetoothPaired":
			transformers[feature] = ValuesFeatureTransformer(MatrixBleutoothFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
		else :
			transformers[feature] = ValuesFeatureTransformer(MatrixFeatureTransformer, feature, data[feature], metadata[feature], start_date, end_date, coocurring_precision)
			
		transformers[feature].transform()
		features_name.append(feature)
		values_name[features_name.index(feature)] = transformers[feature].values_labels
		
	
	
	#construct the time feature
	# One time transformer serves two feature slots: day label and hour label.
	feature = "time"
	timetrans = ValuesTimeFeatureTransformer(MatrixTimeFeatureTransformer, feature, start_date, end_date, coocurring_precision)
	timetrans.transform()
	transformers[ValuesTimeFeatureTransformer.day_label] =  timetrans
	transformers[ValuesTimeFeatureTransformer.hour_label] =  timetrans
	features_name.append(ValuesTimeFeatureTransformer.day_label)
	values_name[features_name.index(ValuesTimeFeatureTransformer.day_label)] = timetrans.day_values_labels
	features_name.append(ValuesTimeFeatureTransformer.hour_label)
	values_name[features_name.index(ValuesTimeFeatureTransformer.hour_label)] = timetrans.time_values_labels
	
	records_labels =  timetrans.records_dates
	records_nb = len(records_labels)
	
	#make space for records
	for r in range(records_nb):
		records.append({})
	
	#II-fill the records
	# records[r] maps feature id -> non-empty value list for record r.
	for fid, fname in enumerate(features_name):
		if fname == ValuesTimeFeatureTransformer.day_label:
			for r in range(records_nb):
				if transformers[fname].day_values_data[r]!= []: records[r][fid] = transformers[fname].day_values_data[r];
		elif fname == ValuesTimeFeatureTransformer.hour_label:
			for r in range(records_nb):
				if transformers[fname].time_values_data[r]!= []: records[r][fid] = transformers[fname].time_values_data[r];
		else:
			for r in range(records_nb):
				if transformers[fname].values_data[r]!= []: records[r][fid] = transformers[fname].values_data[r];
	
	#remove the ones that only contain value for the time feature 
	# <= 2 because every record always carries the day and hour features.
	for r in range(records_nb-1, -1, -1): #Decreasing loop over the records so that remove is possible
		if len(records[r]) <= 2:
			del records[r]
	
	#for the remaining records, add non_present values for the non_persistant features that are not present in each record. non_persistant
	for nf in nonpersistent_features: #add the non_present value as a value that can be taken by the non persistent features
		if nf in features_name: 
			nfid = features_name.index(nf)
			values_name[nfid].append(nonpresent_v)
	
	rtv_data = {}
	for idr, r in enumerate(records):
		for nf in nonpersistent_features:
			if nf in features_name:
				nfid = features_name.index(nf)
				if nfid not in r: r[nfid]=[values_name[nfid].index(nonpresent_v)];
		rtv_data[idr]=r
				
	print "first date of observation "+str(start_date)
	print "first date of observation "+str(end_date)  # NOTE(review): copy-paste -- message should read "last date of observation"
	print "features names "+str(features_name)
	print "values names : "+str(values_name)
	print "number of records "+str(len(rtv_data))
  
	#write the data, the record dates, the feature names and the value names
	RVFDataExtractor.save_rvf(user_id, rtv_data, features_name, values_name, records_labels)
	

	
	
# Esempio n. 10  (Example no. 10 — scrape separator)
# 0
#!/usr/bin/env python
import sys
sys.path.insert(0, "/home/dehajjik/workspace/src/utils")
from compute_ghcm_mdt_one_user import compute_ghcm_mdt_one_user as cgou

from rfv_data_utils import RVFDataExtractor
'''
make the categorized data transformation for all the users
'''
# Run the GHCM_MDT computation for every known user.
for user_id in RVFDataExtractor.users_ids_list():
    uid = str(user_id)
    print("user " + uid + " on working")
    cgou(user_id)
    print("user " + uid + " extracted")