コード例 #1
def getGoodUsers():
    """
    Get the list of user IDs that have visited at least <MIN_EVENTS> events.

    Reads the history file named by the module-level <history_file_name>,
    skips its 3 leading comment lines and groups the visited item IDs by
    user ID.

    Returns:
        list of user IDs whose number of visits is >= MIN_EVENTS.
    """
    print("Getting list of users that have enough visits...")

    # fill dictionary of visits: user ID -> list of visited item IDs
    users_dict = {}
    # "with" guarantees the file is closed even if a line fails to parse
    with open(history_file_name, 'r') as history_file:
        # skip 3 lines of comments
        for i in range(3):
            history_file.readline()
        for line in history_file:
            user_id = misc_functions.getMeta(line, 0)
            item_id = misc_functions.getMeta(line, 1)
            users_dict.setdefault(user_id, []).append(item_id)

    # filter out users with less than <MIN_EVENTS> visits
    good_users_list = [user_id for user_id in users_dict
                       if len(users_dict[user_id]) >= MIN_EVENTS]

    print("Filtered users with enough visits:  %d" % len(good_users_list))
    return good_users_list
コード例 #2
def getGoodUsers():
    """
    Get the list of user IDs that have visited at least <MIN_EVENTS> events.

    The visits are read from the history file named by the module-level
    <history_file_name>; its first 3 lines are comments and are skipped.

    Returns:
        list of user IDs whose number of visits is >= MIN_EVENTS.
    """
    print("Getting list of users that have enough visits...")

    # fill dictionary of visits: user ID -> list of visited item IDs
    users_dict = {}

    history_file = open(history_file_name, 'r')
    try:
        # skip 3 lines of comments
        for i in range(3):
            history_file.readline()

        for line in history_file:
            user_id = misc_functions.getMeta(line, 0)
            item_id = misc_functions.getMeta(line, 1)

            if user_id in users_dict:
                users_dict[user_id].append(item_id)
            else:
                users_dict[user_id] = [item_id]
    finally:
        # release the file handle even if a line fails to parse
        history_file.close()

    # filter out users with less than <MIN_EVENTS> visits
    good_users_list = []
    for user_id in users_dict:
        if len(users_dict[user_id]) >= MIN_EVENTS:
            good_users_list.append(user_id)

    print("Filtered users with enough visits:  %d" % len(good_users_list))
    return good_users_list
コード例 #3
def rewriteUsersData(users_IDs_map, good_users_list):
    """
    Create a new file with users meta, replacing old user IDs with new IDs.

    Only users listed in <good_users_list> are kept.  The result is written
    to "<users_file_name>.preprocessed", ordered by
    misc_functions.sortMetaListByMeta on the first (new ID) field.

    Parameters:
        users_IDs_map   -- dictionary: old user ID -> new user ID
        good_users_list -- old IDs of the users to keep
    """
    print("Writing new users data...")

    # set membership is O(1); the list was scanned once per input line before
    good_users = set(good_users_list)

    good_users_lines = []
    with open(users_file_name, 'r') as users_file:
        for user_line in users_file:
            old_user_id = misc_functions.getMeta(user_line, 0)
            if old_user_id in good_users:
                # keep everything from the first tab on, swap only the ID
                rest_of_line = user_line[user_line.find('\t'):]
                good_users_lines.append(str(users_IDs_map[old_user_id]) + rest_of_line)

    # sort users lines by new user id
    sorted_users_lines = misc_functions.sortMetaListByMeta(good_users_lines, 0)

    # write new file with users data
    with open(users_file_name + ".preprocessed", 'w') as new_users_file:
        for user_line in sorted_users_lines:
            new_users_file.write(user_line)

    print("Users data with new IDs written.")
コード例 #4
def rewriteUsersData(users_IDs_map, good_users_list):
    """
    Create a new file with users meta, replacing old user IDs with new IDs.

    Lines of users that are not in <good_users_list> are dropped.  The kept
    lines are sorted with misc_functions.sortMetaListByMeta on the first
    (new ID) field and written to "<users_file_name>.preprocessed".

    Parameters:
        users_IDs_map   -- dictionary: old user ID -> new user ID
        good_users_list -- old IDs of the users to keep
    """
    print("Writing new users data...")

    # build the lookup set once instead of scanning the list for every line
    wanted_users = set(good_users_list)

    good_users_lines = []

    users_file = open(users_file_name, 'r')
    try:
        for user_line in users_file:
            old_user_id = misc_functions.getMeta(user_line, 0)
            if old_user_id in wanted_users:
                # everything after the ID (starting at the first tab) is kept as is
                rest_of_line = user_line[user_line.find('\t'):]
                good_users_lines.append(
                    str(users_IDs_map[old_user_id]) + rest_of_line)
    finally:
        users_file.close()

    # sort users lines by new user id
    sorted_users_lines = misc_functions.sortMetaListByMeta(good_users_lines, 0)

    # write new file with users data
    new_users_file = open(users_file_name + ".preprocessed", 'w')
    try:
        for user_line in sorted_users_lines:
            new_users_file.write(user_line)
    finally:
        new_users_file.close()

    print("Users data with new IDs written.")
コード例 #5
def createNewEventsIDs():
    """
    Sort events by time and assign them new consecutive IDs.

    The event lines are taken from the module-level <global_meta_list> and
    sorted with misc_functions.sortMetaListByTime on the field at
    <TIME_META_POSITION>.

    Returns:
        [events_IDs_map, time_sorted_meta_list] where events_IDs_map is a
        dictionary with key - old event ID, value - new event ID, and
        time_sorted_meta_list is the sorted list of event lines.
    """
    print("Creating new IDs for events...")

    print("Sorting events by time...")
    time_sorted_meta_list = misc_functions.sortMetaListByTime(
        global_meta_list, TIME_META_POSITION)

    events_IDs_map = {}

    # new IDs follow the time order; start counting from 1
    for new_event_id, event_line in enumerate(time_sorted_meta_list, 1):
        old_event_id = misc_functions.getMeta(event_line, 0)
        events_IDs_map[old_event_id] = new_event_id

    print("New IDs for events created.")
    return [events_IDs_map, time_sorted_meta_list]
コード例 #6
ファイル: prepare_matrices.py プロジェクト: armovetz/course4
def createItem_x_MetaMatrix_global():
    """
    Compute the item_X_meta matrix for all the seminars and save it as
    "items-metas_global" in the "../data/well_done" directory.

    One line of "../data/well_done/meta" is read per seminar; the first 18
    metas of that line fill the first 18 columns of the seminar's row.  The
    matrix has ITEMS_NUMB rows and MAX_METAS_NUMB columns and is written in
    sparse (CSR) form with scipy's Matrix Market writer.
    """
    print("STARTED computing global Item_x_Meta matrix")

    item_X_meta_matrix = numpy.zeros((ITEMS_NUMB, MAX_METAS_NUMB), dtype = int)

    # "with" closes the meta file as soon as all rows are read
    with open("../data/well_done/meta", 'r') as seminars_meta_file:
        for seminar_id in range(ITEMS_NUMB):
            seminar_string = seminars_meta_file.readline()
            #for meta in METAS_TO_USE:
            # NOTE(review): 18 is hard-coded; it must not exceed MAX_METAS_NUMB
            for meta in range(18):
                item_X_meta_matrix[seminar_id][meta] = misc_functions.getMeta(seminar_string, meta)

    csr_matrix_to_write = scipy.sparse.csr_matrix(item_X_meta_matrix)
    scipy.io.mmio.mmwrite("../data/well_done/items-metas_global", csr_matrix_to_write)

    print("SUCCESS! \n ITEMS x METAS global matrix - prepared!")
コード例 #7
def createVisitsMatrix(users_IDs_map, events_IDs_map,
                       good_users_list, good_events_list):
    """
    Create a new matrix of visits.

    The result contains only events with meta data and users with enough
    visits, i.e. only the (user, event) pairs whose old IDs are both in
    <good_users_list> and <good_events_list>.  It is written to
    "<history_file_name>.preprocessed" with a Matrix Market header.

    Parameters:
        users_IDs_map    -- dictionary: old user ID -> new user ID
        events_IDs_map   -- dictionary: old event ID -> new event ID
        good_users_list  -- old IDs of the users to keep
        good_events_list -- old IDs of the events to keep
    """
    print("Creating new visits matrix...")

    # sets give O(1) membership tests; the lists themselves are still used
    # for the dimensions written to the header
    good_users = set(good_users_list)
    good_events = set(good_events_list)

    new_history_lines = []

    with open(history_file_name, 'r') as old_history_file:
        # skip 3 lines of header
        for i in range(3):
            old_history_file.readline()

        for line in old_history_file:
            old_user_id = misc_functions.getMeta(line, 0)
            old_event_id = misc_functions.getMeta(line, 1)

            if (old_user_id in good_users) and (old_event_id in good_events):
                # one entry: "<new user id>\t<new event id>\t1\n"
                ltw = str(users_IDs_map[old_user_id]) + '\t' + str(
                    events_IDs_map[old_event_id]) + '\t' + '1' + '\n'
                new_history_lines.append(ltw)

    with open(history_file_name + ".preprocessed", 'w') as new_history_file:
        # write header for new history file
        new_history_file.write(
            "%%MatrixMarket matrix coordinate integer general\n")
        new_history_file.write("%% Created by CTHULHU\n")
        # size line: rows, columns, number of stored entries
        new_history_file.write(str(len(good_users_list)) + '\t' +
                               str(len(good_events_list)) + '\t' +
                               str(len(new_history_lines)) + '\n')

        for line in new_history_lines:
            new_history_file.write(line)

    print("New visits matrix created.")
コード例 #8
def createVisitsMatrix(users_IDs_map, events_IDs_map,
                       good_users_list, good_events_list):
    """
    Create a new matrix of visits.

    Only visits whose old user ID is in <good_users_list> and whose old
    event ID is in <good_events_list> are kept; both IDs are replaced by
    their new values.  The matrix is written in Matrix Market coordinate
    form to "<history_file_name>.preprocessed".

    Parameters:
        users_IDs_map    -- dictionary: old user ID -> new user ID
        events_IDs_map   -- dictionary: old event ID -> new event ID
        good_users_list  -- old IDs of the users to keep
        good_events_list -- old IDs of the events to keep
    """
    print("Creating new visits matrix...")

    # lookup sets built once; membership in a list is linear per visit
    kept_users = set(good_users_list)
    kept_events = set(good_events_list)

    new_history_lines = []
    old_history_file = open(history_file_name, 'r')
    try:
        # skip 3 lines of header
        for i in range(3):
            old_history_file.readline()
        for line in old_history_file:
            old_user_id  = misc_functions.getMeta(line, 0)
            old_event_id = misc_functions.getMeta(line, 1)
            if (old_user_id in kept_users) and (old_event_id in kept_events):
                ltw = str(users_IDs_map[old_user_id]) + '\t' + str(events_IDs_map[old_event_id]) + '\t' + '1' + '\n'
                new_history_lines.append(ltw)
    finally:
        old_history_file.close()

    new_history_file = open(history_file_name + ".preprocessed", 'w')
    try:
        # write header for new history file
        new_history_file.write("%%MatrixMarket matrix coordinate integer general\n")
        new_history_file.write("%% Created by CTHULHU\n")
        # size line: rows, columns, number of stored entries
        new_history_file.write(str(len(good_users_list)) + '\t' +
                               str(len(good_events_list)) + '\t' +
                               str(len(new_history_lines)) + '\n')

        new_history_file.writelines(new_history_lines)
    finally:
        new_history_file.close()

    print("New visits matrix created.")
コード例 #9
def rewriteEventsData(events_IDs_map, time_sorted_meta_list):
    """
    Create a new file with events meta, replacing old event IDs with new IDs.

    The lines are written to "<events_file_name>.preprocessed" in the order
    of <time_sorted_meta_list>, which is expected to be time sorted already.

    Parameters:
        events_IDs_map        -- dictionary: old event ID -> new event ID
        time_sorted_meta_list -- event lines, already sorted by time
    """
    print("Writing new events data...")

    # "with" flushes and closes the output even if an ID is missing in the map
    with open(events_file_name + ".preprocessed", 'w') as new_events_file:
        for event_line in time_sorted_meta_list:
            old_event_id = misc_functions.getMeta(event_line, 0)
            # keep everything from the first tab on, swap only the ID
            rest_of_line = event_line[event_line.find('\t'):]
            new_events_file.write(str(events_IDs_map[old_event_id]) + rest_of_line)

    print("Events data with new IDs and time-sorted events written.")
コード例 #10
def rewriteEventsData(events_IDs_map, time_sorted_meta_list):
    """
    Create a new file with events meta, replacing old event IDs with new IDs.

    Takes into account that the seminars have been time sorted already: the
    lines are written to "<events_file_name>.preprocessed" in the order in
    which they appear in <time_sorted_meta_list>.

    Parameters:
        events_IDs_map        -- dictionary: old event ID -> new event ID
        time_sorted_meta_list -- event lines, already sorted by time
    """
    print("Writing new events data...")

    new_events_file = open(events_file_name + ".preprocessed", 'w')
    try:
        for event_line in time_sorted_meta_list:
            old_event_id = misc_functions.getMeta(event_line, 0)
            # the part after the ID starts at the first tab and is copied as is
            rest_of_line = event_line[event_line.find('\t'):]
            new_events_file.write(str(events_IDs_map[old_event_id]) + rest_of_line)
    finally:
        # make sure the data reaches the disk and the handle is released
        new_events_file.close()

    print("Events data with new IDs and time-sorted events written.")
コード例 #11
def getGoodEvents():
    """
    Get the list of event IDs that have meta data.

    IMPORTANT: fills the list of meta data for events - the module-level
    variable <global_meta_list>.

    Every line of the file named by <events_file_name> describes one event;
    an event ID is reported once, at its first occurrence.

    Returns:
        list of event IDs, in the order of their first occurrence.
    """
    print("Getting list of events ids that have meta data...")

    events_with_meta_list = []
    # mirror of the list for O(1) duplicate checks
    seen_event_ids = set()

    with open(events_file_name, 'r') as events_file:
        for line in events_file:
            event_id = misc_functions.getMeta(line, 0)
            if event_id not in seen_event_ids:
                seen_event_ids.add(event_id)
                events_with_meta_list.append(event_id)
                # NOTE(review): the body of this branch is missing from the
                # excerpt; storing the event line in <global_meta_list> is
                # reconstructed from the docstring - confirm against the
                # original file
                global_meta_list.append(line)

    print("Filtered events with meta data:  %d" % len(events_with_meta_list))
    return events_with_meta_list
コード例 #12
def getGoodEvents():
    """
    Get the list of event IDs that have meta data.

    IMPORTANT: fills the list of meta data for events - the module-level
    variable <global_meta_list>.

    The events are read from the file named by <events_file_name>, one
    event per line; repeated event IDs are reported only once.

    Returns:
        list of event IDs, in the order of their first occurrence.
    """
    print("Getting list of events ids that have meta data...")

    events_with_meta_list = []
    # set used only for fast duplicate detection
    known_ids = set()

    events_file = open(events_file_name, 'r')
    try:
        for line in events_file:
            event_id = misc_functions.getMeta(line, 0)
            if event_id in known_ids:
                continue
            known_ids.add(event_id)
            events_with_meta_list.append(event_id)
            # NOTE(review): the statements executed for a new event are
            # missing from the excerpt; appending the line to
            # <global_meta_list> is reconstructed from the docstring -
            # confirm against the original file
            global_meta_list.append(line)
    finally:
        events_file.close()

    print("Filtered events with meta data:  %d" % len(events_with_meta_list))
    return events_with_meta_list
コード例 #13
def createNewEventsIDs():
    """
    Sort events by time and create new IDs for them.

    Event lines come from the module-level <global_meta_list>; they are
    sorted with misc_functions.sortMetaListByTime on the field at
    <TIME_META_POSITION> and numbered from 1 in that order.

    Returns:
        [events_IDs_map, time_sorted_meta_list] - the dictionary with
        key - old event ID, value - new event ID, and the sorted event lines.
    """
    print("Creating new IDs for events...")
    print("Sorting events by time...")
    time_sorted_meta_list = misc_functions.sortMetaListByTime(global_meta_list, TIME_META_POSITION)

    events_IDs_map = {}
    # start counting from 1
    new_event_id = 0
    for event_line in time_sorted_meta_list:
        new_event_id += 1
        events_IDs_map[misc_functions.getMeta(event_line, 0)] = new_event_id

    print("New IDs for events created.")
    return [events_IDs_map, time_sorted_meta_list]
コード例 #14
ファイル: prepare_matrices.py プロジェクト: armovetz/course4
def computeMetaMatrix(meta_list, meta_id_position):
    """
    Internal function.

    Compute the user_X_meta matrix for the meta stored at <meta_id_position>
    and save it with scipy's Matrix Market writer under the name
    "users-" + METAS_TO_USE[meta_id_position].

    Column k of the result is the sum of the "history.mtx" columns of all
    seminars whose meta value equals k + 1; meta values without seminars get
    an all-zero column.

    Parameters:
        meta_list        -- seminar lines; presumably ordered by the meta
                            value at <meta_id_position> (a decreasing value
                            raises) - TODO confirm with the caller
        meta_id_position -- position of the meta inside a seminar line

    Raises:
        Exception -- if the meta value of a line does not match the running
                     meta counter (e.g. the lines are not ordered).
    """
    # creating new matrix
    meta_matrix = numpy.zeros((USERS_NUMB, 0), dtype = int)
    # reading local history matrix
    history_matrix = scipy.io.mmio.mmread("history.mtx").tocsr()

    def sumItemColumns(items):
        # Sum the history columns of <items> into one (USERS_NUMB, 1) column.
        column = numpy.zeros((USERS_NUMB, 1), dtype = int)
        for cur_item in items:
            column = column + (history_matrix[ : , cur_item]).toarray()
        return column

    # some routine before main loop
    cur_meta_items = []
    cur_meta_id = 0
    for line in meta_list:
        # seminar with unknown meta is ignored
        if (misc_functions.getMeta(line, meta_id_position) == -1):
            continue

        line_semin_id = getMeta(line, 0) # considering everywhere semin_id_position == 0
        line_meta_id = getMeta(line, meta_id_position)
        # metas at positions 8 and 5 are mapped to categories first
        if meta_id_position == 8:
            line_meta_id = priceToPriceCat(line_meta_id)
        if meta_id_position == 5:
            line_meta_id = dayTime(line_meta_id)

        # if new meta_id detected
        if line_meta_id != cur_meta_id:
            print(line_meta_id)
            # flush the column collected for the previous meta_id
            if cur_meta_items != []:
                meta_matrix = numpy.hstack((meta_matrix, sumItemColumns(cur_meta_items)))
            # stacking empty columns of meta for not visited seminars;
            # ">" instead of "!=" so that a decreasing meta_id reaches the
            # mismatch check below instead of looping forever
            while (line_meta_id > cur_meta_id + 1):
                meta_matrix = numpy.hstack((meta_matrix, numpy.zeros((USERS_NUMB, 1), dtype = int)))
                cur_meta_id += 1
            # clean list if new meta_id begins
            cur_meta_items = []
            cur_meta_id += 1
            if cur_meta_id != line_meta_id:
                raise Exception("mismatch cur_meta_id")

        # NOTE(review): the statement that fills <cur_meta_items> is missing
        # from the excerpt; collecting the seminar id here is a
        # reconstruction (the id is used directly as the history column
        # index) - confirm against the original file
        cur_meta_items.append(line_semin_id)

    # stacking last column
    if cur_meta_items != []:
        meta_matrix = numpy.hstack((meta_matrix, sumItemColumns(cur_meta_items)))
    meta_matrix_csr = scipy.sparse.csr_matrix(meta_matrix)

    # writing new <meta_matrix> to file; the timestamp goes into the comment
    # field of the Matrix Market header
    now = datetime.datetime.now()
    now_string = now.strftime("%Y-%m-%d %H:%M")
    meta_matrix_file_name = "users-" + METAS_TO_USE[meta_id_position]
    scipy.io.mmio.mmwrite(meta_matrix_file_name, meta_matrix_csr, now_string, 'integer')
コード例 #15
if ".." not in sys.path:
    sys.path.insert(0, "..")
import misc_functions

# file with events meta descriptions
dataset_dir = "../../data/datasets"
dataset_name = "Timepad/raw"
events_file_name = "events.preprocessed"

# position of the meta that is extracted from every event line
meta_numb = 8
#meta_name = "headers"

# "with" closes both files (and flushes the output) even if a line fails
# to parse; the input is opened first so that a missing input does not
# leave an empty output file behind
with open(dataset_dir + "/" + dataset_name + "/" + events_file_name, 'r') as events_file:
    # file to save new meta lines
    with open(dataset_dir + "/" + dataset_name + "/times", 'w') as meta_file:
        for line in events_file:
            # one output line per event: the chosen meta only
            meta = str(misc_functions.getMeta(line, meta_numb))
            meta_file.write(meta + '\n')

コード例 #16
ファイル: select_params.py プロジェクト: armovetz/hybrid_rs
if ".." not in sys.path:
    sys.path.insert(0, "..")
import misc_functions

# file with events meta descriptions
dataset_dir = "../../data/datasets"
dataset_name = "Timepad/raw"
events_file_name = "events.preprocessed"
events_file = open(dataset_dir + "/" + dataset_name + "/" + events_file_name, 'r')

# file to save new meta lines
meta_numb = 8
#meta_name = "headers"
meta_file = open(dataset_dir + "/" + dataset_name + "/times", 'w')

try:
    for line in events_file:
        # one output line per event: the meta at position <meta_numb> only
        meta = str( misc_functions.getMeta(line, meta_numb) )
        meta_file.write(meta + '\n')
finally:
    # close explicitly so the output is flushed and the handles are released
    meta_file.close()
    events_file.close()
コード例 #17
ファイル: ndcg.py プロジェクト: armovetz/hybrid_rs
def estimateNDCGp( test_matrix, prediction_matrix, clusters_list, coords, results_file_name ):
    """
    Estimate the accuracy of the prediction for one cross-validation case.

    For every user cluster, the items of each sub-cluster ("byte") are
    ranked by decreasing predicted value.  A byte is scored with
    log(2, rank + 1) of a visited item (1.0 when it is ranked first, 0.0
    when the byte has no visit; with several visits the lowest-ranked one
    decides).  Scores are averaged over the bytes of a user and then over
    all users.

    Parameters:
        test_matrix       -- matrix of actual visits
        prediction_matrix -- matrix of predicted values, same layout
        clusters_list     -- list of user clusters; the first line of a
                             cluster holds the user index in its second
                             tab-separated field, every further line is a
                             byte whose metas 0 and 2 are the first and the
                             last (inclusive) column of the byte
        coords            -- not used
        results_file_name -- prefix of the "<prefix>.nDCGp" results file

    Returns:
        [average nDCGp, average byte size, average position]; the position
        is 2 ** (1 / average nDCGp), or "FAR UNKNOWN" when the average is 0.

    Raises:
        Exception -- if a score falls outside [0, 1].
    """
    # == DEBUG PRINT =====================================================
    for user_cluster in clusters_list:
        for line in user_cluster:
            print(line)
        print("Press any key to continue:")
    # == \DEBUG PRINT ====================================================

    local_average_nDCGp = float(0.0)
    local_average_p = float(0.0)
    for user_cluster in clusters_list:
        user = int(((user_cluster[0]).split("\t"))[1])
        # NOTE(review): the definitions of <user_prediction> and
        # <user_visits> are missing from the excerpt; taking row <user> of
        # each matrix as a dense 1-D array is a reconstruction (assumes
        # scipy sparse matrices and a 0-based user index) - confirm against
        # the original file
        user_prediction = prediction_matrix.getrow(user).toarray()[0]
        user_visits = test_matrix.getrow(user).toarray()[0]

        user_average_nDCGp = float(0.0)
        user_average_p = float(0.0)
        for byte in user_cluster[1 : ]:
            first_col = misc_functions.getMeta(byte, 0)
            last_col = misc_functions.getMeta(byte, 2)
            byte_visits = user_visits[first_col : last_col + 1]
            byte_predictions = user_prediction[first_col : last_col + 1]

            # get indices of columns by decreasing of prediction value
            indices = numpy.lexsort(keys = (-byte_predictions, -byte_predictions))

            # sort vector of visits and predictions with help of indices
            sorted_predictions = byte_predictions.take(indices, axis = 0)
            sorted_visits = byte_visits.take(indices, axis = 0)

            if len(sorted_predictions) != len(sorted_visits):
                raise Exception("visits and prediction clusters have different size")

            # TO BE NORMALIZED
            # Only the position-based score is kept: the excerpt also added
            # the plain DCG sum on top of it, which counted every hit twice
            # and pushed the value above 1 for well-ranked hits, so the
            # range check below raised.
            nDCGp = float(0.0)
            p = len(sorted_predictions)
            for i in range(p):
                if sorted_visits[i] == 1:
                    nDCGp = float(math.log(2, i + 2))

            if (nDCGp < 0.0) or (nDCGp > 1.0):
                print("nDCGp =  %s" % nDCGp)
                raise Exception("Incorrect nDCGp")

            user_average_nDCGp += nDCGp
            user_average_p += p

        # the first line of a cluster is the user header, hence the -1
        if (len(user_cluster) != 1):
            user_average_nDCGp /= (len(user_cluster) - 1)
            user_average_p /= (len(user_cluster) - 1)
        local_average_nDCGp += user_average_nDCGp
        local_average_p += user_average_p

    # no clusters: keep the zero averages instead of dividing by zero
    if clusters_list:
        local_average_nDCGp /= (len(clusters_list))
        local_average_p /= (len(clusters_list))

    if local_average_nDCGp != 0:
        # inverse of score = 1 / log2(position)
        local_average_position = math.pow(2, 1.0 / local_average_nDCGp)
    else:
        local_average_position = "FAR UNKNOWN"

    print("nDCGp for case =  %s" % local_average_nDCGp)
    print("average p =  %s" % local_average_p)
    print("average position %s" % local_average_position)

    # NOTE(review): the excerpt opens the results file but shows no write;
    # the file is still created (truncated) as before and is now closed
    results_file = open(results_file_name + ".nDCGp", 'w')
    results_file.close()
    return [local_average_nDCGp, local_average_p, local_average_position]
コード例 #18
ファイル: ndcg.py プロジェクト: armovetz/hybrid_rs
def estimateNDCGp(test_matrix, prediction_matrix, clusters_list, coords,
                  results_file_name):
    """
    Estimate the accuracy of the prediction for one cross-validation case.

    The items of every sub-cluster ("byte") of a user cluster are ranked by
    decreasing predicted value and the byte is scored with
    log(2, rank + 1) of a visited item: 1.0 when it is ranked first, 0.0
    when the byte has no visit; with several visits the lowest-ranked one
    decides.  The scores are averaged over the bytes of each user and then
    over all users.

    Parameters:
        test_matrix       -- matrix of actual visits
        prediction_matrix -- matrix of predicted values, same layout
        clusters_list     -- list of user clusters; the first line of a
                             cluster holds the user index in its second
                             tab-separated field, every further line is a
                             byte whose metas 0 and 2 are the first and the
                             last (inclusive) column of the byte
        coords            -- not used
        results_file_name -- prefix of the "<prefix>.nDCGp" results file

    Returns:
        [average nDCGp, average byte size, average position]; the position
        is 2 ** (1 / average nDCGp), or "FAR UNKNOWN" when the average is 0.

    Raises:
        Exception -- if a score falls outside [0, 1].
    """
    # == DEBUG PRINT =====================================================
    for user_cluster in clusters_list:
        for line in user_cluster:
            print(line)
        print("Press any key to continue:")
    # == \DEBUG PRINT ====================================================

    nDCGp_total = float(0.0)
    p_total = float(0.0)
    for user_cluster in clusters_list:
        user = int(((user_cluster[0]).split("\t"))[1])
        # NOTE(review): the definitions of <user_prediction> and
        # <user_visits> are missing from the excerpt; taking row <user> of
        # each matrix as a dense 1-D array is a reconstruction (assumes
        # scipy sparse matrices and a 0-based user index) - confirm against
        # the original file
        user_prediction = prediction_matrix.getrow(user).toarray()[0]
        user_visits = test_matrix.getrow(user).toarray()[0]

        bytes_of_user = user_cluster[1:]
        user_average_nDCGp = float(0.0)
        user_average_p = float(0.0)
        for byte in bytes_of_user:
            byte_start = misc_functions.getMeta(byte, 0)
            byte_end = misc_functions.getMeta(byte, 2) + 1
            byte_visits = user_visits[byte_start:byte_end]
            byte_predictions = user_prediction[byte_start:byte_end]

            # get indices of columns by decreasing of prediction value
            indices = numpy.lexsort(keys=(-byte_predictions,
                                          -byte_predictions))

            # sort vector of visits and predictions with help of indices
            sorted_predictions = byte_predictions.take(indices, axis=0)
            sorted_visits = byte_visits.take(indices, axis=0)

            if len(sorted_predictions) != len(sorted_visits):
                raise Exception(
                    "visits and prediction clusters have different size")

            # TO BE NORMALIZED
            # Only the position-based score is kept: the excerpt also added
            # the plain DCG sum on top of it, which counted every hit twice
            # and pushed the value above 1 for well-ranked hits, so the
            # range check below raised.
            nDCGp = float(0.0)
            p = len(sorted_predictions)
            for rank in range(p):
                if sorted_visits[rank] == 1:
                    nDCGp = float(math.log(2, rank + 2))

            if (nDCGp < 0.0) or (nDCGp > 1.0):
                print("nDCGp =  %s" % nDCGp)
                raise Exception("Incorrect nDCGp")

            user_average_nDCGp += nDCGp
            user_average_p += p

        # the first line of a cluster is the user header, so the number of
        # bytes is len(user_cluster) - 1
        if (len(user_cluster) != 1):
            user_average_nDCGp /= (len(user_cluster) - 1)
            user_average_p /= (len(user_cluster) - 1)
        nDCGp_total += user_average_nDCGp
        p_total += user_average_p

    # no clusters: report zero averages instead of dividing by zero
    if clusters_list:
        local_average_nDCGp = nDCGp_total / (len(clusters_list))
        local_average_p = p_total / (len(clusters_list))
    else:
        local_average_nDCGp = nDCGp_total
        local_average_p = p_total

    if local_average_nDCGp != 0:
        # inverse of score = 1 / log2(position)
        local_average_position = math.pow(2, 1.0 / local_average_nDCGp)
    else:
        local_average_position = "FAR UNKNOWN"

    print("nDCGp for case =  %s" % local_average_nDCGp)
    print("average p =  %s" % local_average_p)
    print("average position %s" % local_average_position)

    # NOTE(review): the excerpt opens the results file but shows no write;
    # the file is still created (truncated) as before and is now closed
    results_file = open(results_file_name + ".nDCGp", 'w')
    results_file.close()

    return [local_average_nDCGp, local_average_p, local_average_position]