Example #1
0
    def __init__(self, type, mode, cluster, name):
        """Set up matrix-creation state and make sure the output folder exists.

        :param type: granularity of the matrix, 'user' or 'session'
        :param mode: dataset mode, one of 'small', 'local', 'full'
        :param cluster: cluster folder name used to build the save path
        :param name: name assigned to this matrix instance
        """
        assert type in ('user', 'session')
        assert mode in ('small', 'local', 'full')

        self.name = name
        self.type = type
        self.mode = mode
        self.cluster = cluster

        # where the produced matrices will be serialized
        self.save_path = f'dataset/preprocessed/{cluster}/{mode}/matrices/{type}'
        cf.check_folder(self.save_path)

        # per-action scores plus two meta entries: 'tw' (time-weight scheme)
        # and 'score_update_rule'; None means the action carries no score,
        # 'reset' restarts the session score accumulation
        self.score_dict = {
            'clickout item': 3,
            'interaction item rating': 3,
            'interaction item info': 1,
            'interaction item image': 3,
            'interaction item deals': 1,
            'search for item': 5,
            'search for destination': 'reset',
            'change of sort order': None,
            'filter selection': None,
            'search for poi': None,
            'tw': 'lin',
            'score_update_rule': 'substitute'
        }

        self.accomodations_id = data.accomodations_ids()
        # dataframes are loaded lazily later on
        self.train_df = None
        self.test_df = None
Example #2
0
def append_missing_accomodations(mode):
    """Find accomodation ids referenced by the interactions but absent from the
    accomodations dataframe, and append them (with empty properties) to the
    items CSV at ``data.ITEMS_PATH``.

    :param mode: dataset mode passed through to ``data.train_df`` / ``data.test_df``
    """
    found_ids = []

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # portable equivalent and produces the same stacked frame
    joined_df = pd.concat([data.train_df(mode), data.test_df(mode)])

    # collect numeric references (non-numeric ones, e.g. destination
    # names, are not accomodations and are skipped)
    refs = joined_df.reference
    refs = refs[refs.notnull()].values
    for r in tqdm(refs):
        try:
            found_ids.append(int(r))
        except ValueError:
            continue

    # collect every accomodation id appearing in the pipe-separated impressions
    imprs = joined_df.impressions
    imprs = imprs[imprs.notnull()].values
    for i in tqdm(imprs):
        found_ids.extend(map(int, i.split('|')))

    found_ids = set(found_ids)
    accomod_known = set(map(int, data.accomodations_ids()))
    missing = found_ids.difference(accomod_known)
    missing_count = len(missing)
    print('Found {} missing accomodations'.format(missing_count))

    # free the concatenated frame before building the new one
    del joined_df

    # append the missing ids at the end of the accomodations dataframe;
    # their 'properties' column is left empty (NaN)
    if missing_count > 0:
        new_acc_df = pd.DataFrame({'item_id': list(missing)},
                                  columns=['item_id', 'properties'])

        new_acs = pd.concat([data.accomodations_df(), new_acc_df],
                            ignore_index=True)
        new_acs.to_csv(data.ITEMS_PATH, index=False)
        print('{} successfully updated'.format(data.ITEMS_PATH))
Example #3
0
def urm(mode, cluster, clickout_score=5, impressions_score=1):
    """
    Create the URM considering only the clickout actions of every session,
    then save it to disk together with the row and column lookup dicts.

    :param mode: dataset mode ('small', 'local', 'full')
    :param cluster: cluster folder name used to locate the dataframes
    :param clickout_score: score assigned to clicked accomodations
    :param impressions_score: score assigned to shown impressions; must be
        strictly lower than clickout_score
    :return: None; writes urm_clickout.npz, dict_row.npy and dict_col.npy
        under dataset/preprocessed/{cluster}/{mode}/matrices/
    """
    assert clickout_score > impressions_score

    save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode)

    accomodations_array = data.accomodations_ids()

    train_df = data.train_df(mode=mode, cluster=cluster)
    test_df = data.test_df(mode=mode, cluster=cluster)

    # keep clickouts only; fillna(-1) marks the missing test references
    train_df = train_df[train_df['action_type'] == 'clickout item'].fillna(-1)
    test_df = test_df[test_df['action_type'] == 'clickout item'].fillna(-1)

    df = pd.concat([train_df,
                    test_df])[['session_id', 'reference', 'impressions']]
    session_groups = df.groupby('session_id')

    session_ids = list(session_groups.groups.keys())

    df_references = session_groups.reference.apply(
        lambda x: list(map(int, x))).reset_index(name='references')

    # impressions are identical within a session, so only the first row is parsed
    df_impressions = session_groups.impressions.apply(lambda x: list(
        map(int, x.values[0].split('|')))).reset_index(name='impressions')

    # one hot of references and impressions over the full accomodation set
    mlb = MultiLabelBinarizer(accomodations_array, sparse_output=True)

    clickout_onehot = mlb.fit_transform(df_references.references)

    impr_onehot = mlb.fit_transform(df_impressions.impressions)

    # a clicked item also appears in the impressions, so its final score is
    # (clickout_score - impressions_score) + impressions_score = clickout_score
    urm = (clickout_score - impressions_score
           ) * clickout_onehot + impressions_score * impr_onehot

    # dictionary (k: sessionId - v: urm row)
    row_of_sessionid = {sid: row for row, sid in enumerate(session_ids)}

    # dictionary (k: accomodationId - v: urm col)
    col_of_accomodation = {acc: col for col, acc in enumerate(mlb.classes)}

    cf.check_folder(save_path)

    # save all
    print('Saving urm matrix... ')
    sps.save_npz('{}/urm_clickout.npz'.format(save_path), urm)
    print('done!')

    print('Saving row dictionary... ')
    np.save('{}/dict_row.npy'.format(save_path), row_of_sessionid)
    print('done!')

    print('Saving col dictionary... ')
    np.save('{}/dict_col.npy'.format(save_path), col_of_accomodation)
    print('done!')
Example #4
0
def urm_neg_score_user(mode,
                       _last_click_score=1,
                       _clicked_ref_score=1,
                       _impr_not_seen_score=0,
                       _seen_ref_score=1,
                       cluster='no_cluster'):
    """
    Create a per-user URM whose scores come from
    ``_session_score_negative_value_seen_elem`` (one row per user), then save
    it to disk together with the row and column lookup dicts.

    The score parameters are published as module-level globals because the
    scoring function is invoked through ``groupby.apply`` and cannot receive
    extra arguments directly.

    :param mode: dataset mode ('small', 'local', 'full')
    :param _last_click_score: score for the last clicked accomodation
    :param _clicked_ref_score: score for clicked references
    :param _impr_not_seen_score: score for impressions never interacted with
    :param _seen_ref_score: score for seen references
    :param cluster: cluster folder name used to locate the dataframes
    :return: None; writes urm_negative_user.npz, dict_row_user.npy and
        dict_col_user.npy under dataset/preprocessed/{cluster}/{mode}/matrices/
    """
    global impr_not_seen_score, last_click_score, seen_ref_score, clicked_ref_score
    impr_not_seen_score = _impr_not_seen_score
    last_click_score = _last_click_score
    clicked_ref_score = _clicked_ref_score
    seen_ref_score = _seen_ref_score

    save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode)

    accomodations_array = data.accomodations_ids()

    # load the dataframes according to the mode and cluster
    train_df = data.train_df(mode=mode, cluster=cluster)
    test_df = data.test_df(mode=mode, cluster=cluster)

    # fill missing clickout_item on the test dataframe
    test_df.fillna({'reference': -1}, inplace=True)
    train_df.fillna({'reference': -1}, inplace=True)

    # concatenate the train df and the test df maintaining only the columns of interest
    df = pd.concat([train_df, test_df])[[
        'session_id', 'user_id', 'action_type', 'reference', 'impressions'
    ]]

    session_groups = df.groupby(['user_id'])
    session_ids = list(session_groups.groups.keys())

    rows_count = len(session_groups)
    cols_count = len(accomodations_array)

    # dictionary (k: userId - v: urm row)
    row_of_sessionid = {sid: row for row, sid in enumerate(session_ids)}

    # dictionary (k: accomodationId - v: urm col)
    col_of_accomodation = {acc: col
                           for col, acc in enumerate(accomodations_array)}

    print('dictionaries created\n')

    tqdm.pandas()
    # one score dict per user, in the same (sorted) order as session_ids
    sessions_score = session_groups.progress_apply(
        _session_score_negative_value_seen_elem).values
    print("apply function done\n")

    # build the CSR arrays (data, indices, indptr) row by row
    _data = []
    indptr = [0]
    indices = []

    values_inserted = 0
    for i in tqdm(range(rows_count)):
        # TODO: FIND WHY THERE IS A KEY EQUAL -1
        # (-1 is the fillna placeholder for missing references and must be skipped)
        for k, score in sessions_score[i].items():
            if k != -1:
                indices.append(col_of_accomodation[k])
                _data.append(score)
                values_inserted += 1
        indptr.append(values_inserted)
    _urm = sps.csr_matrix((_data, indices, indptr),
                          shape=(rows_count, cols_count))

    print("URM created\n")

    # check if the folder where to save exists
    cf.check_folder(save_path)

    print('Saving urm matrix... ')
    sps.save_npz('{}/urm_negative_user.npz'.format(save_path), _urm)
    print('done!')

    print('Saving row dictionary... ')
    np.save('{}/dict_row_user.npy'.format(save_path), row_of_sessionid)
    print('done!')

    print('Saving col dictionary... ')
    np.save('{}/dict_col_user.npy'.format(save_path), col_of_accomodation)
    print('done!')
Example #5
0
def urm_session_aware(mode,
                      action_score_dict,
                      cluster='no_cluster',
                      time_weight='lin'):
    """
    Create the URM considering the whole session of a user, scoring every
    interaction via ``_compute_session_score`` (one row per (session, user)
    pair), then save it to disk together with the row and column lookup dicts.

    :param mode: dataset mode ('small', 'local', 'full')
    :param action_score_dict: per-action score mapping
        (NOTE(review): currently unused by this function — the scores are
        decided inside ``_compute_session_score``; kept for interface
        compatibility)
    :param cluster: cluster folder name used to locate the dataframes
    :param time_weight: time-weighting scheme, exposed to
        ``_compute_session_score`` through the module-level global ``tw``
    :return: None; writes urm_session_aware1_{time_weight}.npz, dict_row.npy
        and dict_col.npy under dataset/preprocessed/{cluster}/{mode}/matrices/
    """
    # the scoring function is invoked through groupby.apply and cannot take
    # extra arguments, hence the global side channel
    global tw
    tw = time_weight
    save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode)

    accomodations_array = data.accomodations_ids()

    # load the dataframes according to the mode and cluster
    train_df = data.train_df(mode=mode, cluster=cluster)
    test_df = data.test_df(mode=mode, cluster=cluster)

    # fill missing clickout_item on the test dataframe
    test_df.fillna({'reference': -1}, inplace=True)
    train_df.fillna({'reference': -1}, inplace=True)

    # concatenate the train df and the test df maintaining only the columns of interest
    df = pd.concat([train_df, test_df])[[
        'session_id', 'user_id', 'action_type', 'reference', 'impressions'
    ]]

    session_groups = df.groupby(['session_id', 'user_id'])
    session_ids = list(session_groups.groups.keys())

    rows_count = len(session_groups)
    cols_count = len(accomodations_array)

    # dictionary (k: (sessionId, userId) - v: urm row)
    row_of_sessionid = {sid: row for row, sid in enumerate(session_ids)}

    # dictionary (k: accomodationId - v: urm col)
    col_of_accomodation = {acc: col
                           for col, acc in enumerate(accomodations_array)}

    print('dictionaries created\n')

    tqdm.pandas()
    # one score dict per group, in the same (sorted) order as session_ids
    sessions_score = session_groups.progress_apply(
        _compute_session_score).values
    print("apply function done\n")

    # build the CSR arrays (data, indices, indptr) row by row
    _data = []
    indptr = [0]
    indices = []

    values_inserted = 0
    for i in tqdm(range(rows_count)):
        for k, score in sessions_score[i].items():
            indices.append(col_of_accomodation[k])
            _data.append(score)
            values_inserted += 1
        indptr.append(values_inserted)
    _urm = sps.csr_matrix((_data, indices, indptr),
                          shape=(rows_count, cols_count))

    print("URM created\n")

    # check if the folder where to save exists
    cf.check_folder(save_path)

    print('Saving urm matrix... ')
    sps.save_npz('{}/urm_session_aware1_{}.npz'.format(save_path, time_weight),
                 _urm)
    print('done!')

    print('Saving row dictionary... ')
    np.save('{}/dict_row.npy'.format(save_path), row_of_sessionid)
    print('done!')

    print('Saving col dictionary... ')
    np.save('{}/dict_col.npy'.format(save_path), col_of_accomodation)
    print('done!')