def getGoodUsers():
    """
    get list of user ids that have visited at least
    <MIN_EVENTS> events
    """
    print "Getting list of users that have enough visits..."

    # fill dictionary of visits
    users_dict = {}
    history_file = open(history_file_name, 'r')
    # skip 3 lines of comments
    for i in range(3):
        history_file.readline()
    for line in history_file:
        user_id = misc_functions.getMeta(line, 0)
        item_id = misc_functions.getMeta(line, 1)
        if user_id in users_dict:
            users_dict[user_id].append(item_id)
        else:
            # start the list with the first visit; the original code
            # created an empty list here, losing one event per user
            users_dict[user_id] = [item_id]
    history_file.close()

    # filter out users with less than <MIN_EVENTS> visits
    good_users_list = []
    for user_id in users_dict:
        if len(users_dict[user_id]) >= MIN_EVENTS:
            good_users_list.append(user_id)

    print "Filtered users with enough visits: ", len(good_users_list)
    return good_users_list
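# misc_functions.getMeta is used throughout but is not part of this dump.
# A minimal sketch consistent with its usage above (an assumption, not the
# actual implementation): fields are tab-separated and the field at
# <position> holds an integer ID.
def getMeta(line, position):
    """ return the integer field at <position> of a tab-separated line """
    return int(line.rstrip('\n').split('\t')[position])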
def rewriteUsersData(users_IDs_map, good_users_list):
    """
    creates new file with users meta;
    in fact just replaces old users IDs with new IDs
    """
    print "Writing new users data..."

    users_file = open(users_file_name, 'r')
    good_users_lines = []
    for user_line in users_file:
        old_user_id = misc_functions.getMeta(user_line, 0)
        if old_user_id in good_users_list:
            # keep everything after the first tab (the leading tab included)
            rest_of_line = user_line[user_line.find('\t'):]
            good_users_lines.append(str(users_IDs_map[old_user_id]) + rest_of_line)
    users_file.close()

    # sort users lines by new user id
    sorted_users_lines = misc_functions.sortMetaListByMeta(good_users_lines, 0)

    # write new file with users data
    new_users_file = open(users_file_name + ".preprocessed", 'w')
    for user_line in sorted_users_lines:
        new_users_file.write(user_line)
    new_users_file.close()

    print "Users data with new IDs written."
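# misc_functions.sortMetaListByMeta is not shown in this dump. A plausible
# sketch, assuming it sorts meta lines numerically by the tab-separated
# field at <position> (an assumption, not the actual implementation):
def sortMetaListByMeta(meta_list, position):
    """ sort lines by the integer field at <position> """
    return sorted(meta_list, key=lambda line: int(line.split('\t')[position]))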
def createNewEventsIDs():
    """
    sorts events by time;
    creates dictionary with key - old event ID, value - new event ID
    """
    print "Creating new IDs for events..."

    print "Sorting events by time..."
    time_sorted_meta_list = misc_functions.sortMetaListByTime(
        global_meta_list, TIME_META_POSITION)

    # start counting from 1
    new_event_id = 1
    events_IDs_map = {}
    for event_line in time_sorted_meta_list:
        old_event_id = misc_functions.getMeta(event_line, 0)
        events_IDs_map[old_event_id] = new_event_id
        new_event_id += 1

    print "New IDs for events created."
    return [events_IDs_map, time_sorted_meta_list]
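# misc_functions.sortMetaListByTime is also absent from this dump; if the
# field at <time_position> is a sortable integer timestamp (an assumption),
# it reduces to the same keyed sort as the sortMetaListByMeta sketch above:
def sortMetaListByTime(meta_list, time_position):
    """ sort meta lines by the timestamp field at <time_position> """
    return sortMetaListByMeta(meta_list, time_position)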
def createItem_x_MetaMatrix_global():
    """
    Function computes item_X_meta matrix for all the seminars
    (that shows current meta), and saves this matrix as
    "items-metas_global.mtx" in "data/well_done" directory.
    """
    print "STARTED computing global Item_x_Meta matrix"

    seminars_meta_file = open("../data/well_done/meta", 'r')
    item_X_meta_matrix = numpy.zeros((ITEMS_NUMB, MAX_METAS_NUMB), dtype=int)

    for seminar_id in range(ITEMS_NUMB):
        seminar_string = seminars_meta_file.readline()
        #for meta in METAS_TO_USE:
        for meta in range(18):
            item_X_meta_matrix[seminar_id][meta] = \
                misc_functions.getMeta(seminar_string, meta)
    seminars_meta_file.close()

    csr_matrix_to_write = scipy.sparse.csr_matrix(item_X_meta_matrix)
    scipy.io.mmio.mmwrite("../data/well_done/items-metas_global",
                          csr_matrix_to_write)

    print "SUCCESS! \n ITEMS x METAS global matrix - prepared!"
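# Hypothetical sanity check (not in the original code): round-trip the
# matrix written above through scipy's MatrixMarket reader.
def checkGlobalItemMetaMatrix():
    m = scipy.io.mmio.mmread("../data/well_done/items-metas_global.mtx").tocsr()
    print "shape = ", m.shape              # expect (ITEMS_NUMB, MAX_METAS_NUMB)
    print "first row = ", m[0].toarray()   # metas of the first seminar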
def createVisitsMatrix(users_IDs_map, events_IDs_map,
                       good_users_list, good_events_list):
    """
    creates new matrix of visits;
    result matrix contains only events with meta data and
    users with at least <MIN_EVENTS> visits each
    """
    print "Creating new visits matrix..."

    old_history_file = open(history_file_name, 'r')
    # skip 3 lines of header
    for i in range(3):
        old_history_file.readline()

    new_history_lines = []
    for line in old_history_file:
        old_user_id = misc_functions.getMeta(line, 0)
        old_event_id = misc_functions.getMeta(line, 1)
        if (old_user_id in good_users_list) and \
           (old_event_id in good_events_list):
            ltw = str(users_IDs_map[old_user_id]) + '\t' + \
                  str(events_IDs_map[old_event_id]) + '\t' + '1' + '\n'
            new_history_lines.append(ltw)
    old_history_file.close()

    new_history_file = open(history_file_name + ".preprocessed", 'w')
    # write header for new history file
    new_history_file.write("%%MatrixMarket matrix coordinate integer general\n")
    new_history_file.write("%% Created by CTHULHU\n")
    new_history_file.write(str(len(good_users_list)) + '\t' +
                           str(len(good_events_list)) + '\t' +
                           str(len(new_history_lines)) + '\n')
    for line in new_history_lines:
        new_history_file.write(line)
    new_history_file.close()

    print "New visits matrix created."
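# The preprocessing entry point is not part of this dump. A hypothetical
# driver showing the data dependencies between the functions above
# (createNewUsersIDs is assumed to exist by analogy with
# createNewEventsIDs; the names and call order are an assumption):
def preprocess():
    good_users_list = getGoodUsers()
    good_events_list = getGoodEvents()      # also fills global_meta_list
    events_IDs_map, time_sorted_meta_list = createNewEventsIDs()
    users_IDs_map = createNewUsersIDs(good_users_list)   # hypothetical helper
    rewriteUsersData(users_IDs_map, good_users_list)
    rewriteEventsData(events_IDs_map, time_sorted_meta_list)
    createVisitsMatrix(users_IDs_map, events_IDs_map,
                       good_users_list, good_events_list)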
def rewriteEventsData(events_IDs_map, time_sorted_meta_list):
    """
    creates new file with events meta;
    in fact just replaces old events IDs with new IDs;
    takes into account that seminars have been time-sorted already
    """
    print "Writing new events data..."

    new_events_file = open(events_file_name + ".preprocessed", 'w')
    for event_line in time_sorted_meta_list:
        old_event_id = misc_functions.getMeta(event_line, 0)
        rest_of_line = event_line[event_line.find('\t'):]
        new_events_file.write(str(events_IDs_map[old_event_id]) + rest_of_line)
    new_events_file.close()

    print "Events data with new IDs and time-sorted events written."
def getGoodEvents():
    """
    get list of event ids that have meta data;
    IMPORTANT: fills list of meta data for events - global var <global_meta_list>
    """
    print "Getting list of events ids that have meta data..."

    events_file = open(events_file_name, 'r')
    events_with_meta_list = []
    for line in events_file:
        global_meta_list.append(line)
        event_id = misc_functions.getMeta(line, 0)
        if event_id not in events_with_meta_list:
            events_with_meta_list.append(event_id)
    events_file.close()

    print "Filtered events with meta data: ", len(events_with_meta_list)
    return events_with_meta_list
def computeMetaMatrix(meta_list, meta_id_position):
    """
    Internal function.
    Function computes user_X_meta matrix for current <meta_id_position>
    (that shows current meta), and saves this matrix as
    "users-<meta>.mtx" in directory for current case.
    NOTE: the loop below assumes <meta_list> is sorted in ascending order
    of the meta field at <meta_id_position>; otherwise the gap-filling
    never terminates.
    """
    # creating new (empty) matrix to stack meta columns onto
    meta_matrix = numpy.zeros((USERS_NUMB, 0), dtype=int)

    # reading local history matrix
    history_matrix = scipy.io.mmio.mmread("history.mtx").tocsr()

    # some routine before main loop; 0 means "no meta seen yet",
    # meta IDs in the file start from 1
    cur_meta_items = []
    cur_meta_id = 0

    # MAIN LOOP
    for line in meta_list:
        # seminar with unknown meta is ignored
        if misc_functions.getMeta(line, meta_id_position) == -1:
            continue

        # considering everywhere semin_id_position == 0
        line_semin_id = misc_functions.getMeta(line, 0)
        line_meta_id = misc_functions.getMeta(line, meta_id_position)

        # PRICE HEURISTIC
        if meta_id_position == 8:
            line_meta_id = priceToPriceCat(line_meta_id)
        # TIME HEURISTIC
        if meta_id_position == 5:
            line_meta_id = dayTime(line_meta_id)

        # if new meta_id detected: flush the accumulated column
        if line_meta_id != cur_meta_id:
            print line_meta_id
            new_meta_col = numpy.zeros((USERS_NUMB, 1), dtype=int)
            for cur_item in cur_meta_items:
                cur_item_col = (history_matrix[:, cur_item]).toarray()
                new_meta_col = new_meta_col + cur_item_col
            if cur_meta_items != []:
                meta_matrix = numpy.hstack((meta_matrix, new_meta_col))

            # stacking empty columns of meta for not visited seminars
            while line_meta_id != cur_meta_id + 1:
                meta_matrix = numpy.hstack(
                    (meta_matrix, numpy.zeros((USERS_NUMB, 1), dtype=int)))
                cur_meta_id += 1

            # clean list if new meta_id begins
            cur_meta_items = []

            cur_meta_id += 1
            if cur_meta_id != line_meta_id:
                raise Exception("mismatch cur_meta_id")

        cur_meta_items.append(line_semin_id)

    # stacking last column
    new_meta_col = numpy.zeros((USERS_NUMB, 1), dtype=int)
    for cur_item in cur_meta_items:
        cur_item_col = (history_matrix[:, cur_item]).toarray()
        new_meta_col = new_meta_col + cur_item_col
    if cur_meta_items != []:
        meta_matrix = numpy.hstack((meta_matrix, new_meta_col))

    meta_matrix_csr = scipy.sparse.csr_matrix(meta_matrix)

    # writing new <meta_matrix> to file
    now = datetime.datetime.now()
    now_string = now.strftime("%Y-%m-%d %H:%M")
    meta_matrix_file_name = "users-" + METAS_TO_USE[meta_id_position]
    scipy.io.mmio.mmwrite(meta_matrix_file_name, meta_matrix_csr,
                          now_string, 'integer')
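# priceToPriceCat and dayTime are referenced above but absent from this
# dump. Plausible sketches of the two bucketing heuristics (the bucket
# boundaries and the time encoding are assumptions, not the original values):
def priceToPriceCat(price):
    """ map a raw price to a small category id: free / cheap / expensive """
    if price <= 0:
        return 1
    elif price < 1000:
        return 2
    else:
        return 3

def dayTime(time_value):
    """ map a time field (assumed hour-based) to morning / afternoon / evening """
    hour = time_value % 24
    if hour < 12:
        return 1
    elif hour < 18:
        return 2
    else:
        return 3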
if ".." not in sys.path: sys.path.insert(0, "..") import misc_functions # file with events meta descriptions dataset_dir = "../../data/datasets" dataset_name = "Timepad/raw" events_file_name = "events.preprocessed" events_file = open(dataset_dir + "/" + dataset_name + "/" + events_file_name, 'r') # file to save new meta lines meta_numb = 8 #meta_name = "headers" meta_file = open(dataset_dir + "/" + dataset_name + "/times", 'w') for line in events_file: ltw = "" #print line #id = misc_functions.getMeta(line, 0) #ltw = ltw + str(id) + '\t' meta = str(misc_functions.getMeta(line, meta_numb)) ltw += meta meta_file.write(ltw + '\n') events_file.close() meta_file.close()
if ".." not in sys.path: sys.path.insert(0, "..") import misc_functions # file with events meta descriptions dataset_dir = "../../data/datasets" dataset_name = "Timepad/raw" events_file_name = "events.preprocessed" events_file = open(dataset_dir + "/" + dataset_name + "/" + events_file_name, 'r') # file to save new meta lines meta_numb = 8 #meta_name = "headers" meta_file = open(dataset_dir + "/" + dataset_name + "/times", 'w') for line in events_file: ltw = "" #print line #id = misc_functions.getMeta(line, 0) #ltw = ltw + str(id) + '\t' meta = str( misc_functions.getMeta(line, meta_numb) ) ltw += meta meta_file.write(ltw + '\n') events_file.close() meta_file.close()
def estimateNDCGp(test_matrix, prediction_matrix, clusters_list, coords,
                  results_file_name):
    """
    Function estimates accuracy of prediction in each case of
    cross-validation - it gets results of prediction from "prediction.mtx",
    test matrix from "test.mtx", and gets info about clusters from the
    "test_clusters" file.
    Function uses the normalized DCGp metric - it counts the error for each
    cluster, averages it over clusters per user, and then averages over all
    users in the current cross-validation case.
    """
    # file stuff: remove after check - we get matrices from the call above
    #prediction_matrix = scipy.io.mmio.mmread(prediction_file_name).tocsr()
    #test_matrix = scipy.io.mmio.mmread("test.mtx").tocsr()

    # == DEBUG PRINT =====================================================
    #for user_cluster in clusters_list:
    #    for line in user_cluster:
    #        print line
    #    print "Press any key to continue:"
    #    sys.stdin.read(1)
    # == \DEBUG PRINT ====================================================

    local_average_nDCGp = float(0.0)
    local_average_p = float(0.0)

    for user_cluster in clusters_list:
        # the first line of a cluster holds the user id
        # in its second tab-separated field
        user = int(((user_cluster[0]).split("\t"))[1])

        # per-user rows of predictions and ground-truth visits;
        # NOTE: these two assignments are missing from this dump and are
        # reconstructed from how the variables are used below (assumption)
        user_prediction = (prediction_matrix[user, :]).toarray()[0]
        user_visits = (test_matrix[user, :]).toarray()[0]

        user_average_nDCGp = float(0.0)
        user_average_p = float(0.0)

        # the remaining lines describe event ranges ("bytes") of the cluster
        for byte in user_cluster[1:]:
            byte_visits = user_visits[misc_functions.getMeta(byte, 0):
                                      misc_functions.getMeta(byte, 2) + 1]
            byte_predictions = user_prediction[misc_functions.getMeta(byte, 0):
                                               misc_functions.getMeta(byte, 2) + 1]

            # indices that sort the range by decreasing prediction value
            # (the duplicated key makes lexsort a plain stable descending sort)
            indices = numpy.lexsort(keys=(-byte_predictions, -byte_predictions))
            ideal_indices = numpy.lexsort(keys=(byte_visits, byte_visits))  # unused

            # sort vectors of visits and predictions with help of indices
            sorted_predictions = byte_predictions.take(indices, axis=0)
            sorted_visits = byte_visits.take(indices, axis=0)

            if len(sorted_predictions) != len(sorted_visits):
                raise Exception(
                    "visits and prediction clusters have different size")

            # DCG with a single relevant item at 0-based rank i is
            # 1 / log2(i + 2); with exactly one visited event per cluster
            # the ideal DCG is 1, so the value below is already normalized
            nDCGp = float(0.0)
            p = len(sorted_predictions)
            for i in range(p):
                if sorted_visits[i] == 1:
                    nDCGp = float(math.log(2, i + 2))
                    break
            """ full (non-binary) DCG variant, kept from the original:
            for i in range(1, p + 1):
                nDCGp += float(sorted_visits[i - 1]) / float(math.log(i + 1, 2))
            """

            if (nDCGp < 0.0) or (nDCGp > 1.0):
                print "nDCGp = ", nDCGp
                raise Exception("Incorrect nDCGp")

            user_average_nDCGp += nDCGp
            user_average_p += p

        # user_cluster[0] is the header line, hence the "- 1"
        if len(user_cluster) != 1:
            user_average_nDCGp /= (len(user_cluster) - 1)
            user_average_p /= (len(user_cluster) - 1)

        local_average_nDCGp += user_average_nDCGp
        local_average_p += user_average_p

    local_average_nDCGp /= len(clusters_list)
    local_average_p /= len(clusters_list)

    if local_average_nDCGp != 0:
        # back out an average rank: since nDCGp = 1 / log2(rank + 1),
        # 2 ** (1 / nDCGp) equals rank + 1
        local_average_position = math.pow(2, 1.0 / local_average_nDCGp)
    else:
        local_average_position = "FAR UNKNOWN"

    print "nDCGp for case = ", local_average_nDCGp
    print "average p = ", local_average_p
    print "average position", local_average_position

    results_file = open(results_file_name + ".nDCGp", 'w')
    results_file.write(str(local_average_nDCGp))
    results_file.close()

    return [local_average_nDCGp, local_average_p, local_average_position]
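# A tiny worked check of the per-range metric above (illustrative values,
# not part of the original code): one visited event that the predictions
# rank second gives nDCGp = 1 / log2(1 + 2) ~= 0.63.
def _demoNDCGp():
    byte_visits = numpy.array([0, 1, 0, 0])
    byte_predictions = numpy.array([0.9, 0.7, 0.2, 0.1])
    indices = numpy.lexsort((-byte_predictions, -byte_predictions))
    sorted_visits = byte_visits.take(indices, axis=0)   # -> [0, 1, 0, 0]
    for i in range(len(sorted_visits)):
        if sorted_visits[i] == 1:
            print "nDCGp = ", math.log(2, i + 2)        # 0.6309...
            break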