Example #1
def test_build_features():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(
        range(users),
        range(items),
        ["user:{}".format(x) for x in range(users)],
        ["item:{}".format(x) for x in range(items)],
    )

    # Build from lists
    user_features = dataset.build_user_features(
        [
            (user_id, ["user:{}".format(x) for x in range(users)])
            for user_id in range(users)
        ]
    )
    assert user_features.getnnz() == users ** 2

    item_features = dataset.build_item_features(
        [
            (item_id, ["item:{}".format(x) for x in range(items)])
            for item_id in range(items)
        ]
    )
    assert item_features.getnnz() == items ** 2

    # Build from dicts
    user_features = dataset.build_user_features(
        [
            (user_id, {"user:{}".format(x): float(x) for x in range(users)})
            for user_id in range(users)
        ],
        normalize=False,
    )

    assert np.all(user_features.todense() == np.array([list(range(users))] * users))

    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ],
        normalize=False,
    )

    assert np.all(item_features.todense() == np.array([list(range(items))] * items))

    # Test normalization
    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ]
    )

    assert np.all(item_features.sum(1) == 1.0)
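
The test above exercises the two input formats accepted by build_user_features and build_item_features: (id, [feature, ...]) lists, where each listed feature gets weight 1.0, and (id, {feature: weight}) dicts with explicit weights. A minimal standalone sketch (the feature names are illustrative, not from the test):

from lightfm.data import Dataset

dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit(users=[0, 1], items=[0], user_features=["age:young", "age:old"])

# List form: each listed feature gets weight 1.0 (rows normalized by default).
uf_list = dataset.build_user_features([(0, ["age:young"]), (1, ["age:old"])])

# Dict form: explicit weights; normalize=False keeps them as given.
uf_dict = dataset.build_user_features(
    [(0, {"age:young": 2.0}), (1, {"age:old": 3.0})], normalize=False)

print(uf_list.todense())  # rows sum to 1.0
print(uf_dict.todense())  # raw weights 2.0 and 3.0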
Example #3
def interactions(df):
    movie_genre = [x.split("|") for x in df["genre"]]
    all_movie_genre = sorted(
        list(set(itertools.chain.from_iterable(movie_genre))))

    all_occupations = sorted(list(set(df["occupation"])))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=all_movie_genre,
        user_features=all_occupations,
    )

    item_features = dataset.build_item_features(
        (x, y) for x, y in zip(df.itemID, movie_genre))

    user_features = dataset.build_user_features(
        (x, [y]) for x, y in zip(df.userID, df["occupation"]))

    (interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)

    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
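
interactions() above passes (user, item, rating) triples, so build_interactions fills both the binary interactions matrix and a separate weights matrix; with bare (user, item) pairs the weights default to 1.0. A minimal sketch of both return values (the toy DataFrame is illustrative):

import pandas as pd
from lightfm.data import Dataset

df = pd.DataFrame({"userID": [1, 1, 2], "itemID": [10, 20, 10],
                   "rating": [5.0, 3.0, 4.0]})
dataset = Dataset()
dataset.fit(df["userID"], df["itemID"])

interactions, weights = dataset.build_interactions(
    (row.userID, row.itemID, row.rating) for row in df.itertuples())
print(interactions.todense())  # 0/1 entries
print(weights.todense())       # carries the ratings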
Example #4
def prepareData(df, tags):
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(
            0).unique().values)].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))

    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)

    dataset.fit_partial(items=rowM,
                        item_features=colM,
                        users=rowU,
                        user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # NOTE: zip(rowM, [colM]) yields only a single (item, features) pair; if
    # colM holds per-item feature lists, zip(rowM, colM) is likely intended.
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
Example #5
    def obtener_matrices(self):
        """
        obtener_matrices method. Builds the matrices needed to create the LightFM models.

        This method is only used in the text interface.
        """

        global train, test, modelo, item_features, user_features

        # Load the dataframes
        Entrada.obtener_datos()
        ratings_df = Entrada.ratings_df
        users_df = Entrada.users_df
        items_df = Entrada.items_df

        # Turn the dataframes into matrices the models can use
        dataset = Dataset()
        dataset.fit(users_df[users_df.columns.values[0]],
                    items_df[items_df.columns.values[0]],
                    user_features=users_df[users_df.columns.values[1]],
                    item_features=items_df[items_df.columns.values[1]])

        # If the model is collaborative or hybrid, take the users' ratings into account
        if self.opcion_modelo == 1 or self.opcion_modelo == 2:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]],
                 row[ratings_df.columns.values[2]])
                for index, row in ratings_df.iterrows())
        else:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]])
                for index, row in ratings_df.iterrows())

        # Build the feature matrices and save them
        item_features = dataset.build_item_features(
            (row[items_df.columns.values[0]],
             [row[items_df.columns.values[1]]])
            for index, row in items_df.iterrows())
        user_features = dataset.build_user_features(
            (row[users_df.columns.values[0]],
             [row[users_df.columns.values[1]]])
            for index, row in users_df.iterrows())
        print("Guarda la matriz de item features")
        guardar_datos_pickle(item_features, 'la matriz de item features')
        print("Guarda la matriz de user features")
        guardar_datos_pickle(user_features, 'la matriz de user feautures')

        # Split the interactions into training and test sets and save them
        train, test = random_train_test_split(interacciones,
                                              test_percentage=0.2)
        print("Guarda la matriz de entrenamiento")
        guardar_datos_pickle(train, 'la matriz de entrenamiento')
        print("Guarda la matriz de test")
        guardar_datos_pickle(test, 'la matriz de test')
Example #6
def test_fitting_no_identity():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
Example #7
def test_fitting():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
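
Examples #6 and #7 differ only in the identity-features flags: with the defaults (True), every user and item gets its own indicator feature, which is why the feature matrices are square and carry one nonzero per row even with no side features supplied. A quick sketch of the contrast:

from lightfm.data import Dataset

with_id = Dataset()  # identity features on by default
with_id.fit(range(3), range(4))
print(with_id.user_features_shape())  # (3, 3)
print(with_id.item_features_shape())  # (4, 4)

no_id = Dataset(user_identity_features=False, item_identity_features=False)
no_id.fit(range(3), range(4))
print(no_id.user_features_shape())  # (3, 0)
print(no_id.item_features_shape())  # (4, 0)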
Example #10
    def fit_data(self, matrix, user_features=None, item_features=None):
        """
        Create datasets for .fit() method.
        Args:
            matrix: User-item interactions matrix (weighted)
            user_features: User-features pandas dataframe which index contains user_ids (crd_no)
            item_features:  Item-features pandas dataframe which index contains good_ids (plu_id)
        Returns:
            Model with fitted (mapped) datasets
        """
        matrix.sort_index(inplace=True)
        matrix.sort_index(inplace=True, axis=1)
        dataset = Dataset()
        dataset.fit((x for x in matrix.index), (x for x in matrix.columns))
        interactions = pd.melt(
            matrix.replace(0, np.nan).reset_index(),
            id_vars='index',
            # NOTE: matrix.columns[1:] skips the first item column; after
            # reset_index, list(matrix.columns) would cover every item.
            value_vars=list(matrix.columns[1:]),
            var_name='plu_id',
            value_name='rating').dropna().sort_values('index')
        interactions.columns = ['crd_no', 'plu_id', 'rating']
        self.interactions, self.weights = dataset.build_interactions(
            [tuple(x) for x in interactions.values])

        if user_features is not None:
            user_features.sort_index(inplace=True)
            dataset.fit_partial(users=user_features.index,
                                user_features=user_features)
            self.user_features = dataset.build_user_features(
                ((index, dict(row))
                 for index, row in user_features.iterrows()))
        else:
            self.user_features = None
        if item_features is not None:
            item_features.sort_index(inplace=True)
            dataset.fit_partial(items=item_features.index,
                                item_features=item_features)
            self.item_features = dataset.build_item_features(
                ((index, dict(row))
                 for index, row in item_features.iterrows()))
        else:
            self.item_features = None
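
fit_data above relies on fit_partial to register side features after the initial fit; fit_partial extends the existing mappings instead of resetting them, so ids seen earlier keep their indices. A minimal sketch (toy ids, illustrative feature name):

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(users=["u1"], items=["i1"])
dataset.fit_partial(users=["u2"], user_features=["premium"])

# u1 keeps index 0; u2 and the new feature are appended to the mappings.
print(dataset.interactions_shape())   # (2, 1)
print(dataset.user_features_shape())  # (2, 3): 2 identity columns + "premium"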
Example #11
def create_datasets(cluster_id):

    events_list = get_events_from_es(cluster_id)

    dataframe_interactions, dataframe_users_features, dataframe_item_features, user_tuple, item_tuple = create_interactions_and_features(events_list, cluster_id)

    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)

    # print(user_tuple)
    # print(item_tuple)

    user_features = format_users_features(dataframe_users_features)

    #print(user_features)

    item_features = format_items_features(dataframe_item_features)

    #print(item_features)

    dataset = Dataset()

    dataset.fit(
            dataframe_interactions['user'].unique(), # all the users
            dataframe_interactions['item'].unique(), # all the items
            user_features=user_features,
            item_features=item_features
    )

    (interactions, weights) = dataset.build_interactions(
        [(x[0], x[1], x[2]) for x in dataframe_interactions.values])

    # print(interactions)
    # print(weights)

    final_user_features = dataset.build_user_features(user_tuple, normalize=False)

    final_item_features = dataset.build_item_features(item_tuple, normalize=False)

    return dataset, interactions, weights, final_item_features, final_user_features
Example #12
ratings = pd.read_csv('ratings.txt', sep=';', header=None)

from lightfm.data import Dataset

dataset = Dataset(user_identity_features=True, item_identity_features=True)
dataset.fit(users=(users[50].unique()),
            items=(items[0]),
            item_features=list(range(2, 10)),
            user_features=list(range(2, 50)))

items_features_raw = list(
    (item[1], (np.argwhere(np.array(item[3:]) == 1)[0] + 2).tolist())
    for item in items.itertuples())
items_features = dataset.build_item_features(items_features_raw)
users_features_raw = build_user_dict(users)
users_features = dataset.build_user_features(users_features_raw)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

ratings2 = ratings[ratings[2] > 0]
ratings2 = ratings2.drop_duplicates(subset=[1, 2, 3])
train, test = train_test_split(ratings2, test_size=0.1)
print(train.shape)
print(test.shape)

(train_interactions, train_weights) = dataset.build_interactions(
    train[[3, 1]].values)
(test_interactions, test_weights) = dataset.build_interactions(
    test[[3, 1]].values)
Example #13
    print('[ %04ds ] Files loaded' % (time.time() - start_time))

    all_user_features = ['NO_FEAT']
    all_business_features = Business.collect_business_features(business_stats)

    all_user_ids = User.extract_user_ids(user_stats)
    all_business_ids = Business.extract_business_ids(business_stats)

    dataset = Dataset()
    dataset.fit(all_user_ids,
                all_business_ids,
                user_features=all_user_features,
                item_features=all_business_features)

    user_features = dataset.build_user_features(
        User.build_user_features(user_stats, all_user_ids), True)

    business_features = dataset.build_item_features(
        Business.build_business_features(business_stats, all_business_ids),
        True)

    print('[ %04ds ] Dataset initialized' % (time.time() - start_time))

    user_avg, user_std = Review.extract_user_average_and_std(training_set)
    normalized_training_reviews = Review.normalize_by_user(
        training_set, user_avg)
    training_interactions = Review.extract_sparse_interaction_matrix(
        normalized_training_reviews)
    training_user_ids = Review.extract_user_ids(normalized_training_reviews)
    training_business_ids = Review.extract_business_ids(
        normalized_training_reviews)
Example #14
            interactions.idaviso.values,
            interactions.rating.values
        ],
        dtype=np.object).T

    res_interactions, res_weights = lfm_dataset.build_interactions(data=interactions)

    print(50 * '-')
    print('Building User Features...')
    print(50 * '-')

    users_features = np.array(
        [users.idpostulante.values,
         users[['edad', 'sexo', 'educacion']].values.tolist()],
        dtype=np.object).T
    print(users_features)
    print(50*'-')

    u_feat = lfm_dataset.build_user_features(data=users_features, normalize=False)


    print(50 * '-')

    print('Matrix factorization model...')
    mf_model = rs.runMF(interactions=res_interactions,
                        n_components=30,
                        loss='warp',
                        k=15,
                        epoch=50,
                        n_jobs=24)
    print(50 * "-")
    print('Building Recommendations...')

    users = pd.Series(data=list(u_map.keys()), name='idpostulante')
Example #15
# fit dataset
dataset.fit(users=user_iterable,
            items=iteam_iterable,
            user_features=user_feature_names,
            item_features=item_feature_names
            )

# check shape
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items: {}.'.format(num_users, num_items))
_, num_users_feature = dataset.user_features_shape()
_, num_items_feature = dataset.item_features_shape()
print('Num users feature: {}, num_items feature: {}.'.format(num_users_feature, num_items_feature))

# build user feature matrix
user_feature_matrix = dataset.build_user_features(user_feature_iterable, normalize=True)

# build item feature matrix
item_feature_matrix = dataset.build_item_features(item_feature_iterable, normalize=True)

# build interaction
(train_interactions, weights) = dataset.build_interactions(
    data=((row['userCode'], row['project_id'], row[interaction_col_name])
          for index, row in train.iterrows()
          if row['project_id'] not in ignore_project))

from lightfm import LightFM

model = LightFM(loss='warp', random_state=44, learning_schedule='adagrad')
model.fit(train_interactions,
        item_features=item_feature_matrix,
        user_features=user_feature_matrix,
        )
Example #16
def train_model(
               df, user_id_col='user_id', item_id_col='business_id',
               item_name_col='name_business', evaluate=True):
    """ Train the model using collaborative filtering.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.
    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
        user_feature_map: the feature map of users
        business_feature_map: the feature map of items
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')

    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(
        users_features, normalize=False)

    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(
            items.values[i], categories))
    items_features = ds_full.build_item_features(
        items_features, normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])
    # model
    model_full = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model_full.fit(
        interactions, user_features=users_features,
        item_features=items_features, sample_weight=weights,
        epochs=10, num_threads=10)
    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
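
train_model ends by unpacking ds_full.mapping(), which returns four dicts in a fixed order: user id to row, user feature name to column, item id to row, item feature name to column. Inverting the id maps is the usual route back from matrix indices to external ids. In miniature (toy ids only):

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(["alice", "bob"], ["pizza", "salad"])
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

print(user_id_map)  # {'alice': 0, 'bob': 1}
inv_item_id_map = {v: k for k, v in item_id_map.items()}
print(inv_item_id_map[0])  # 'pizza'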
Example #17
def lambda_handler(event, context):
    try:
        ## Fetch data from RDS code
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)

        print("Connection successful")
    except:
        print("Connection error")

    # In[3]:

    #Get Food DataFrame
    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)

    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ],
                 axis=1,
                 inplace=True)
    # food_df.head()
    print('Food Dataframe imported')

    # In[4]:

    # # TODO: Perform Binning
    # food_30_bins = ['Alcohol', 'Calories', 'Saturated_Fats']
    # for each_column in food_30_bins:
    #     bins = np.linspace(food_df[each_column].min(), food_df[each_column].max(), 30)
    #     food_df[each_column+'bin'] = pd.cut(food_df[each_column], bins, labels=np.arange(0,len(bins)-1))
    # food_df

    # In[5]:

    # for each_column in food_30_bins:
    #     print(food_df[each_column].min())

    # In[6]:

    #Get User Dataframe
    # user_df = pd.read_csv('user_db_try.csv')
    # user_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)

    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ],
                 axis=1,
                 inplace=True)
    # user_df.head()

    print('User Dataframe imported')

    # In[7]:

    #Get userItem DataFrame
    # userItem_df = pd.read_csv('userItem_db_try_new.csv')
    # userItem_df.head()

    dict_list = []

    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)

    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    # userItem_df.head()
    print('UserItem Dataframe imported')

    # In[8]:

    #Make all the feature values unique
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(
                column_name) + ":" + food_df[column_name].astype(str)
    # food_df.head()

    # In[9]:

    #This dict will be useful while creating tuples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')
    # food_features_dict

    # In[10]:

    food_feature_values = []

    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    # food_feature_values

    # In[11]:

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(
                column_name) + ":" + user_df[column_name].astype(str)

    user_features_df = user_df.drop(['userID'], axis=1).copy()

    user_features_dict = user_features_df.to_dict('split')
    # user_features_dict

    # In[12]:

    user_feature_values = []

    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    # user_feature_values

    # In[13]:

    user_tuples = []
    food_tuples = []

    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))

    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    # food_tuples

    # In[14]:

    print("Creating LightFm dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))

    print("Dataset Created")
    # In[15]:

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # In[16]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((each_feature for each_feature in food_features)for food_features in food_features_dict['data']))

    # In[17]:

    # dataset.fit_partial(items=(food_id for food_id in food_df['Food_Code']),
    #                            item_features=((row['Milk'], row['Meats'], row['Alcohol'], row['Calories'])for index,row in food_df.iterrows()))

    # In[18]:

    print("fittng item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value
                                       for each_value in food_feature_values))

    # In[19]:

    # dataset.fit_partial(users=(user_id for user_id in user_df['Id']),
    #                     user_features=((each_feature for each_feature in user_features)for user_features in user_features_dict['data']))

    # In[20]:
    print("fittng user partial features")

    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value
                                       for each_value in user_feature_values))

    # In[21]:

    # dataset.item_features_shape()
    # dataset.user_features_shape()

    # In[22]:

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    # print(repr(interactions))
    # print(weights)

    # In[23]:

    # interactions.shape

    # In[24]:

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple
                                                for each_tuple in food_tuples)
    # print(item_features)

    # In[25]:

    user_features = dataset.build_user_features(each_tuple
                                                for each_tuple in user_tuples)
    # print(user_features)

    # In[26]:

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions,
              item_features=item_features,
              user_features=user_features)

    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)

    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"

    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")

    print("Pickle file deleted")
    print("Successssss!!!!!")
Example #18
             'writer_name']].drop_duplicates().reset_index(drop=True)
users = uid[['uid', 'popular_section', 'popular_platform',
             'popular_sources']].drop_duplicates()

dataset = Dataset()
features_list = create_feature_list(items,
                                    cols=['section_primary', 'writer_name'])
user_features_list = create_feature_list(
    users, cols=['popular_section', 'popular_platform', 'popular_sources'])

#features_list = list(set(items.writer_name.to_list()))
dataset.fit(users=uid.uid.unique(),
            items=uid.article_id.unique(),
            item_features=features_list,
            user_features=user_features_list)

(interactions, weights) = dataset.build_interactions(
    (x.uid, x.article_id) for x in uid.itertuples())
n_users, n_items = interactions.shape
sparsity = 1 - (interactions.getnnz() / (interactions.shape[0] * interactions.shape[1]))  # fraction of empty cells
item_features = dataset.build_item_features([
    (i.article_id, [i.section_primary, i.writer_name])
    for i in items.itertuples()
])
user_features = dataset.build_user_features([(u.uid, [u.popular_section])
                                             for u in users.itertuples()])

# these rebuilt feature matrices replace the ones constructed just above
item_features = dataset.build_item_features(build_features(items))
user_features = dataset.build_user_features(build_features(users))
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
Example #19
def model(df, params, u=None, i=None):
    state = np.random.RandomState(params['seed'])
    data = Dataset()
    data.fit(df['userID'].unique(),
             df['poiID'].unique(),
             user_features=u[1] if u is not None else None,
             item_features=i[1] if i is not None else None)

    if u is not None:
        user_features_iterable = map(lambda l: (l[0], l[1]), u[0].iteritems())
        user_features = data.build_user_features(user_features_iterable,
                                                 normalize=False)
    else:
        user_features = None

    if i is not None:
        item_features_iterable = map(lambda l: (l[0], [l[1]]),
                                     i[0].iteritems())
        item_features = data.build_item_features(item_features_iterable,
                                                 normalize=False)
    else:
        item_features = None

    ratings, weights = data.build_interactions(
        df[['userID', 'poiID']].itertuples(index=False, name=None))

    train, test = random_train_test_split(ratings,
                                          test_percentage=params['test'],
                                          random_state=state)

    lfm = LightFM(no_components=params['f'],
                  learning_rate=params['lr'],
                  loss=params['loss'],
                  user_alpha=params['alpha'],
                  random_state=state)
    lfm.fit(train,
            epochs=params['epochs'],
            user_features=user_features,
            item_features=item_features)

    return {
        'pr-train':
        100.0 * precision_at_k(lfm,
                               train,
                               k=params['k'],
                               user_features=user_features,
                               item_features=item_features).mean(),
        'mrr-train':
        100.0 * reciprocal_rank(lfm,
                                train,
                                user_features=user_features,
                                item_features=item_features).mean(),
        'pr-test':
        100.0 * precision_at_k(lfm,
                               test,
                               k=params['k'],
                               user_features=user_features,
                               item_features=item_features).mean(),
        'mrr-test':
        100.0 * reciprocal_rank(lfm,
                                test,
                                user_features=user_features,
                                item_features=item_features).mean()
    }
Example #20
dataset = Dataset()
dataset.fit((x[0] for i, x in users.iterrows()),
            (x[1] for i, x in users.iterrows()))
(interactions, weights) = dataset.build_interactions(
    (x[0], x[1]) for i, x in users.iterrows())  # needed by runMF below; the original left this commented out
dataset.fit_partial(items=(x['destinationid']
                           for i, x in destinations.iterrows()),
                    item_features=(x['Destination-tf-idf']
                                   for i, x in destinations.iterrows()))
dataset.fit_partial(users=(x['userid'] for i, x in users.iterrows()),  # the original passed these user ids as items=, which looks like a bug
                    user_features=(x['age'] for i, x in users.iterrows()))

item_features = dataset.build_item_features(
    ((x['destinationid'], [x['Destination-tf-idf']])
     for i, x in destinations.iterrows()))
user_features = dataset.build_user_features(
    ((x['userid'], [x['age']]) for i, x in users.iterrows()))

mf_model = runMF(interactions=interactions,
                 item_features=item_features,
                 user_features=user_features,
                 n_components=30,
                 loss='warp',
                 epoch=30,
                 n_jobs=4)


def get_all_users():
    return user_all_records


def sample_recommendation_user_1(user_id):
Example #21
# fit the dataset to create mappings for users, items and respective features
dataset.fit((x['_id'] for x in full_users), (x['_id'] for x in locations_data),
            user_features=interestList,
            item_features=subCatList)

# build interactions from ratings data, 'liked' places only at the moment (~350 only)
interactions = dataset.build_interactions(
    ((x['userId'], x['place'], x['weight']) for x in full_ratings))
print(repr(interactions[0]))

# print number of users and items
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# build user features from users' interests
user_features = dataset.build_user_features(
    ((x['_id'], x['interests']) for x in full_users), normalize=False)

# build item features from the locations' subcategories
item_features = dataset.build_item_features(
    ((x['_id'], x['subCategory']) for x in locations_data), normalize=False)
# print(repr(item_features))

with open('data.json', 'w') as outfile:
    json.dump(dataset.mapping(), outfile)

model = LightFM(loss='warp', no_components=20)
model.fit(interactions[0],
          user_features=user_features,
          item_features=item_features)

train_auc = auc_score(model,
Example #22
dataset.fit_partial(users=(x['User-ID'] for x in get_user_features()),
                    items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()),
                    user_features=(x['Age'] for x in get_user_features()))

(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

#print(repr(interactions))

item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
#print(repr(item_features))


user_features = dataset.build_user_features(((x['User-ID'], [x['Age']])
                                              for x in get_user_features()))


labels = np.array([x['ISBN'] for x in get_ratings()])

#################################
#                               #
#       Training the Model      #
#                               #
#################################

model = LightFM(loss='warp')

(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)

model.fit(train, item_features=item_features, user_features=user_features, epochs=2)
Example #23
def main():

	if request.method == 'POST':
		global df_movies
		# global top_trending_ids
		# print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
		print(request.form)
		# Get recommendations!
		if 'run-mf-model' in request.form:
			
			for i, user_rating in enumerate(session['arr']):
				session['arr'][i] = user_rating[:-2]
			session['movieIds'] = session['movieIds'][:-2]
			rated_movies = min(len(session['arr'][0]), len(session['movieIds']))
			for i, user_rating in enumerate(session['arr']):
				session['arr'][i] = user_rating[:rated_movies]
			session['movieIds'] = session['movieIds'][:rated_movies]

			pu = recommendation_mf(session['arr'], session['members'], session['movieIds'])


			session.clear()
			top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
			session['counter'] = 0
			session['members'] = 0
			session['userAges'] = []
			session['userGenders'] = []
			session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml)
			session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) 
			session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
			session['arr'] = None
			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
		
		if 'run-siamese-model' in request.form:
			# global df
			global friends
			global ratings
			global new_friend_id
			new_ratings = []
			for mid, movie_real_id in enumerate(session['movieIds']):
				avg_mv_rating = np.median(np.array([user_ratings[mid] for user_ratings in session['arr']]))
				new_ratings.append({'movie_id_ml':movie_real_id, 
									'rating': avg_mv_rating,
									'friend_id': new_friend_id}) 
			new_friend = {'friend_id': new_friend_id, 'friends_age': np.mean(np.array(session['userAges'])), 'friends_gender': np.mean(np.array(session['userGenders']))}	

			friends.append(new_friend)
			ratings.extend(new_ratings)

			dataset = LightFMDataset()
			item_str_for_eval = "x['title'],x['release'], x['unknown'], x['action'], x['adventure'],x['animation'], x['childrens'], x['comedy'], x['crime'], x['documentary'], x['drama'],  x['fantasy'], x['noir'], x['horror'], x['musical'],x['mystery'], x['romance'], x['scifi'], x['thriller'], x['war'], x['western'], *soup_movie_features[x['soup_id']]"
			friend_str_for_eval = "x['friends_age'], x['friends_gender']"

			dataset.fit(users=(int(x['friend_id']) for x in friends),
						items=(int(x['movie_id_ml']) for x in movies),
						item_features=(eval("("+item_str_for_eval+")") for x in movies),
						user_features=((eval(friend_str_for_eval)) for x in friends))
			num_friends, num_items = dataset.interactions_shape()
			print(f'Num friends: {num_friends}, num_items {num_items}. {datetime.datetime.now()}')

			(interactions, weights) = dataset.build_interactions(((int(x['friend_id']), int(x['movie_id_ml']))
													  for x in ratings))
			item_features = dataset.build_item_features(((x['movie_id_ml'], 
											  [eval("("+item_str_for_eval+")")]) for x in movies) )
			user_features = dataset.build_user_features(((x['friend_id'], 
											  [eval(friend_str_for_eval)]) for x in friends) )

			print(f"Item and User features created {datetime.datetime.now()}")

			epochs = 50 #150
			lr = 0.015
			max_sampled = 11

			loss_type = "warp"  # "bpr"


			model = LightFM(learning_rate=lr, loss=loss_type, max_sampled=max_sampled)

			model.fit_partial(interactions, epochs=epochs, user_features=user_features, item_features=item_features)
			train_precision = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()

			train_auc = auc_score(model, interactions, user_features=user_features, item_features=item_features).mean()

			print(f'Precision: {train_precision}, AUC: {train_auc}, {datetime.datetime.now()}')

			k = 18
			top_movie_ids, scores = predict_top_k_movies(model, new_friend_id, k, num_items, user_features=user_features, item_features=item_features, use_features = False)
			top_movies = df_movies[df_movies.movie_id_ml.isin(top_movie_ids)]

			pu = recommendation_siamese(top_movies, scores)

			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': 0, 'buttonDisable': False,'chooseRecommendation':False, 'recommendation': pu}))
		
		# Collect friends info
		elif 'person-select-gender-0' in request.form:
			for i in range(session['members']):
				session['userAges'].append(int(request.form.get(f'age-{i}')))
				session['userGenders'].append(int(request.form.get(f'person-select-gender-{i}')))

			return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': session['members'], 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))

		# Choose number of people in the group
		elif 'people-select' in request.form:
			count = int(request.form.get('people-select'))
			session['members'] = count
			session['arr'] = [[0 for x in range(15)] for y in range(count)] 
			return(render_template('main.html', settings = {'friendsInfo':True, 'showVote': False, 'people': count, 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))

		# All people voting
		elif 'person-select-0' in request.form:
			for i in range(session['members']):
				session['arr'][i][session['counter']] = int(request.form.get(f'person-select-{i}'))
			
			session['counter'] += 1 
			if session['counter'] < 15:     
				return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': True, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':False, 'recommendation': None}))
			else:
				return(render_template('main.html', settings = {'friendsInfo':False, 'showVote': False, 'people': len(request.form), 'buttonDisable': True,'chooseRecommendation':True,  'recommendation': None}))

	elif request.method == 'GET':
		session.clear()
		top_trending_ids = list(df_movies.sort_values(by="trending_score").head(200).sample(15).movie_id_ml)
		print(top_trending_ids)
		print(list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) )
		session['counter'] = 0
		session['members'] = 0
		session['userAges'] = []
		session['userGenders'] = []
		session['movieIds'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].movie_id_ml) 
		session['top15'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].title) 
		session['top15_posters'] = list(df_movies[df_movies.movie_id_ml.isin(top_trending_ids)].poster_url)
		session['arr'] = None

		return(render_template('main.html', settings = {'showVote': False, 'people': 0, 'buttonDisable': False, 'recommendation': None}))
Example #24
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')

    X1.columns = ['X1_' + i for i in X1.columns]

    # NOTE: np.arange(0, 1, 0.1) omits the 1.0 edge, so values in the top
    # decile fall outside the bins and become NaN; np.arange(0, 1.1, 0.1)
    # (or q=10) may be what was intended.
    X1['X1_5'] = pd.qcut(X1['X1_5'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.arange(0, 1, 0.1), duplicates='drop')

    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')

    X1 = X1.reset_index()

    from lightfm.data import Dataset
    dataset = Dataset()
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))

    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_6']))

    user_features = dataset.build_user_features(
        [(x[1]['id'], x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8', 'X1_6'
                            ]].values.tolist()) for x in X1.iterrows()],
        normalize=True)

    (interactions,
     weights) = dataset.build_interactions(zip(*X2[['id', 'A']].values.T))

    model = LightFM(no_components=32,
                    learning_rate=0.04,
                    loss='bpr',
                    max_sampled=55,
                    random_state=0)
    num_epochs = 20
    for i in range(num_epochs):
        model.fit_partial(interactions, user_features=user_features)

    users_mapping, user_features_mapping, assets_mapping, asset_features_mapping = dataset.mapping()
    user_features_mapping_inv = {
        j: i
        for i, j in user_features_mapping.items()
    }

    tag_embeddings = (model.user_embeddings.T /
                      np.linalg.norm(model.user_embeddings, axis=1)).T

    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)],
                                 index=X1['id'])

    return lightfm_embed
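
lightfm_node reads the learned vectors off model.user_embeddings, which holds one row per user feature in mapping order (identity features first when enabled), so slicing the first len(users_mapping) rows recovers per-user embeddings. A small self-contained sketch (toy ids only):

from lightfm import LightFM
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(["u1", "u2"], ["i1", "i2"])
interactions, _ = dataset.build_interactions([("u1", "i1"), ("u2", "i2")])

model = LightFM(no_components=8, random_state=0)
model.fit(interactions, epochs=5)

# One embedding per user feature; here only the two identity features exist.
print(model.user_embeddings.shape)  # (2, 8)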
Example #25
    def obtener_matrices_gui(self, ruta_ratings, sep_ratings, encoding_ratings,
                             ruta_users, sep_users, encoding_users, ruta_items,
                             sep_items, encoding_items):
        """
        Método obtener_matrices_gui. Obtiene las matrices necesarias para la creación de los modelos de LightFM.

        Este método solo se utiliza en la interfaz web.

        Parameters
        ----------

        ruta_ratings: str
            ruta del archivo que contiene las valoraciones.
        sep_ratings: str
            separador utilizado en el archivo de valoraiones.
        encoding_ratings: str
            encoding utilizado en el archivo de valoraciones.
        ruta_users: str
            ruta del archivo que contiene los datos de los usuarios.
        sep_users: str
            separador utilizado en el archivo de usuarios.
        encoding_users: str
            encoding utilizado en el archivo de usuarios.
        ruta_items: str
            ruta del archivo que contiene los datos de los ítems.
        sep_items: str
            separador utilizado en el archivo de ítems.
        encoding_items: str
            encoding utilizado en el archivo de ítems.
        """

        global train, test, item_features, user_features

        # Load the dataframes
        ratings_df = Entrada.leer_csv(ruta_ratings, sep_ratings,
                                      encoding_ratings)
        ratings_df.sort_values(
            [ratings_df.columns.values[0], ratings_df.columns.values[1]],
            inplace=True)
        users_df = Entrada.leer_csv(ruta_users, sep_users, encoding_users)
        users_df.sort_values([users_df.columns.values[0]], inplace=True)
        items_df = Entrada.leer_csv(ruta_items, sep_items, encoding_items)
        items_df.sort_values([items_df.columns.values[0]], inplace=True)

        # Turn the dataframes into matrices the models can use
        dataset = Dataset()
        dataset.fit(users_df[users_df.columns.values[0]],
                    items_df[items_df.columns.values[0]],
                    user_features=users_df[users_df.columns.values[1]],
                    item_features=items_df[items_df.columns.values[1]])

        # If the model is collaborative or hybrid, take the users' ratings into account
        if self.opcion_modelo == 1 or self.opcion_modelo == 2:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]],
                 row[ratings_df.columns.values[2]])
                for index, row in ratings_df.iterrows())
        else:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]])
                for index, row in ratings_df.iterrows())

        # Build the feature matrices and save them
        item_features = dataset.build_item_features(
            (row[items_df.columns.values[0]],
             [row[items_df.columns.values[1]]])
            for index, row in items_df.iterrows())
        user_features = dataset.build_user_features(
            (row[users_df.columns.values[0]],
             [row[users_df.columns.values[1]]])
            for index, row in users_df.iterrows())
        print("Guarda la matriz de item features")
        guardar_datos_pickle(item_features, 'la matriz de item features')
        print("Guarda la matriz de user features")
        guardar_datos_pickle(user_features, 'la matriz de user feautures')

        # Split the interactions into training and test sets and save them
        train, test = random_train_test_split(interacciones,
                                              test_percentage=0.2)
        print("Guarda la matriz de entrenamiento")
        guardar_datos_pickle(train, 'la matriz de entrenamiento')
        print("Guarda la matriz de test")
        guardar_datos_pickle(test, 'la matriz de test')
Example #26
def evaluate_model(
                  df, user_id_col='user_id',
                  item_id_col='business_id', stratify=None):
    """ Model evaluation.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: if use stratification.
    No return value
    """
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)
    ds = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    train_users = train.drop_duplicates('user_id')
    # train_users = train[train.duplicated('user_id') == False]
    train_user_features = []
    for i in range(len(train_users)):
        train_user_features.append(get_users_features_tuple(
            train_users.values[i]))
    train_user_features = ds.build_user_features(
        train_user_features, normalize=False)

    test_users = test.drop_duplicates('user_id')
    # test_users = test[test.duplicated('user_id') == False]
    test_user1_features = []
    for i in range(len(test_users)):
        test_user1_features.append(get_users_features_tuple(
            test_users.values[i]))
    test_user_features = ds.build_user_features(
        test_user1_features, normalize=False)

    train_items = train.drop_duplicates('business_id')
    # train_items = train[train.duplicated('business_id') == False]
    train_item1_features = []
    for i in range(len(train_items)):
        train_item1_features.append(get_items_features_tuple(
            train_items.values[i], categories))
    train_item_features = ds.build_item_features(
        train_item1_features, normalize=False)

    test_items = test.drop_duplicates('business_id')
    # test_items = test[test.duplicated('business_id') == False]
    test_item_features = []
    for i in range(len(test_items)):
        test_item_features.append(get_items_features_tuple(
            test_items.values[i], categories))
    test_item_features = ds.build_item_features(
        test_item_features, normalize=False)

    # plugging in the interactions and their weights
    (train_interactions, train_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in train.values])
    (test_interactions, test_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in test.values])

    # model
    model = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model.fit(
        train_interactions, user_features=train_user_features,
        item_features=train_item_features, sample_weight=train_weights,
        epochs=10, num_threads=10)

    # auc-roc
    train_auc = auc_score(
        model, train_interactions, user_features=train_user_features,
        item_features=train_item_features, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(
        model, test_interactions, user_features=test_user_features,
        item_features=test_item_features, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
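
evaluate_model follows the standard LightFM evaluation pattern: fit on the training interactions, then average auc_score over each split. Stripped to its essentials (toy interactions only):

import numpy as np
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset
from lightfm.evaluation import auc_score

dataset = Dataset()
dataset.fit(range(5), range(10))
interactions, _ = dataset.build_interactions(
    (u, i) for u in range(5) for i in range(u, 10, 2))

train, test = random_train_test_split(
    interactions, test_percentage=0.2, random_state=np.random.RandomState(0))
model = LightFM(loss='warp', random_state=0)
model.fit(train, epochs=5)

print('Training set AUC: %s' % auc_score(model, train).mean())
print('Testing set AUC: %s' % auc_score(model, test).mean())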
Example #27
def predict_hard_users(
    train: pd.DataFrame,
    test: pd.DataFrame,
    genre: pd.DataFrame,
    education: pd.DataFrame,
    notices: pd.DataFrame,
    available_notices: set,
    applicant_notice: dict,
    header=None,
):
    user_feature = genre.merge(education, on="idpostulante", how="left")
    user_feature.drop(columns=["fechanacimiento"], inplace=True)
    user_feature_hard_user = user_feature[user_feature.idpostulante.isin(
        train.idpostulante)]

    uf = generate_features(user_feature[["sexo", "nombre", "estado"]])
    itf = generate_features(notices[[
        "nombre_zona", "tipo_de_trabajo", "nivel_laboral", "nombre_area"
    ]])

    dataset1 = Dataset()
    dataset1.fit(
        train.idpostulante.unique(),  # all the users
        notices.idaviso.unique(),
        user_features=uf,  # additional user features
        item_features=itf,  # additional item features
    )
    # plugging in the interactions and their weights
    (interactions, weights) = dataset1.build_interactions([
        (x[1], x[0], x[3]) for x in train.values
    ])

    user_feature_list = generate_in_use_features(
        user_feature_hard_user[["sexo", "nombre", "estado"]].values,
        ["sexo", "nombre", "estado"],
    )
    user_tuple = list(
        zip(user_feature_hard_user.idpostulante, user_feature_list))

    user_features = dataset1.build_user_features(user_tuple, normalize=False)

    (
        user_id_map,
        user_feature_map,
        item_id_map,
        item_feature_map,
    ) = dataset1.mapping()

    inv_item_id_map = {v: k for k, v in item_id_map.items()}

    # for component in [10, 35, 50, 80, 100, 200]:
    component = 35
    model = lfm.LightFM(no_components=component, loss="warp", random_state=42)
    model.fit(
        interactions,
        # user_features=user_features,
        # sample_weight=weights,
        epochs=150,
        num_threads=8,
        verbose=True,
    )

    test_precision = precision_at_k(
        model,
        interactions,
        # user_features=user_features,
        k=10,
        num_threads=8,
    ).mean()
    logger.info(
        f"Evaluation for LightFM is: {test_precision} with {component} number of component"
    )

    final_predictions = {}
    for a_user in tqdm(test.idpostulante.unique()):
        try:
            notices_by_user = applicant_notice[a_user]
        except KeyError:  # user with no recorded notices
            notices_by_user = set()
        try:
            user_x = user_id_map[a_user]
        except KeyError:  # user unseen at fit time; fall back to index 0
            user_x = 0
        n_users, n_items = interactions.shape
        prediction = np.argsort(
            model.predict(
                user_x,
                np.arange(n_items),
                # user_features=user_features,
            ))[::-1]
        prediction_for_user = []
        for pred in prediction:
            notice = inv_item_id_map[pred]
            should_add = (notice in available_notices
                          and notice not in notices_by_user)
            if should_add:
                prediction_for_user += [notice]
            if len(prediction_for_user) == 10:
                break
        final_predictions[a_user] = prediction_for_user

    write_dict(final_predictions, "lightfm", header)
    return ["lightfm"]
Example #28
    business_stats: Dict[str, Business] = Business.load_from_file(
        business_stats_file)

    print('[ %04ds ] Files loaded' % (time.time() - start_time))

    all_user_features = ['NO_FEAT']
    all_business_features = Business.collect_business_features(business_stats)

    dataset = Dataset()
    dataset.fit(User.extract_user_ids(user_stats),
                Business.extract_business_ids(business_stats),
                user_features=all_user_features,
                item_features=all_business_features)

    user_features = dataset.build_user_features(
        User.build_user_features(user_stats,
                                 User.extract_user_ids(user_stats)), True)

    business_features = dataset.build_item_features(
        Business.build_business_features(
            business_stats, Business.extract_business_ids(business_stats)),
        True)

    print('[ %04ds ] Dataset initialized' % (time.time() - start_time))

    user_avg, user_std = Review.extract_user_average_and_std(training_set)
    normalized_training_reviews = Review.normalize_by_user(
        training_set, user_avg)
    training_interactions = Review.extract_sparse_interaction_matrix(
        normalized_training_reviews)
Example #29
def preprocess():
    import pandas as pd
    import math
    import numpy as np 
            
    data_users = pd.read_csv('users_tag.csv',index_col=0)
    data_business = pd.read_csv('business_Nora.csv',index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv',index_col = 0)        
            
    data_users.review_count = pd.Series([math.log(x+1) for x in data_users.review_count])
    data_users.useful =  pd.Series([math.log(x+1) for x in data_users.useful])  
            
    #clean business skewness
    data_business.review_count =  pd.Series([math.log(x+1) for x in data_business.review_count])        
            
    from lightfm.data import Dataset        
            
    #model establishment
    dataset = Dataset()
    dataset.fit(data_review.user_id,data_review.business_id)
    type(dataset)
    num_users, num_items = dataset.interactions_shape()        
            
    # fit item and user features. 
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['stars'])
            
            
    dataset.fit_partial(items=data_business.business_id,
                        item_features=['review_count'])        
            
    tar_cols = [x for x in data_business.columns[24:]] 
            
    dataset.fit_partial(items = data_business.business_id,
                       item_features = tar_cols)        
            
    user_cols = [x for x in data_users[['review_count', 'useful',
                                       'Ice Cream & Frozen Yogurt', 'Korean', 'Tapas/Small Plates',
           'Vietnamese', 'Vegan', 'Caribbean', 'Food Delivery Services', 'Lounges',
           'Pubs', 'Greek', 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms',
           'Delis', 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer',
           'American (New)', 'Juice Bars & Smoothies', 'Shopping', 'Barbeque',
           'Sports Bars', 'French', 'Chicken Wings', 'Gastropubs', 'Diners',
           'Gluten-Free', 'Thai', 'Comfort Food', 'Health Markets', 'Halal',
           'Caterers', 'Arts & Entertainment']]]        
            
    dataset.fit_partial(users=data_users.user_id,
                        user_features = user_cols)  
          
    print("Building Interactions")        
    (interactions, weights) = dataset.build_interactions([(x['user_id'],
                                                           x['business_id'],
                                                           x['stars']) for index,x in data_review.iterrows()])   
    print("Interactions Build")        
    # build user and item features
    
    def build_dict(df,tar_cols,val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        
        if(sum_val == 0):
            return rst
        else:
            
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
        return rst
    
    def user_build_dict(df,tar_cols,val_list):
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        
        if(sum_val == 0):
            return rst
        else:
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
        return rst
    
    # get max of each column to regularize value to [0,1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)
    
    # give CF info weight 0.5, all other 0.5. Then in others, give (star, review count) 0.25 and tags 0.25
    item_features = dataset.build_item_features(((x['business_id'], 
                                                  {'stars':0.5*x['stars']/max_star,
                                                   'review_count':0.5*x['review_count']/max_b_rc,
                                                   **build_dict(x,tar_cols,[0.5*x['stars']/max_star,
                                                               0.5*x['review_count']/max_b_rc])})
                                                  for index,x in data_business.iterrows()))
    
    
    # user_features = dataset.build_user_features(((x['user_id'],
    #                                              [x['is_elite'],x['year']])
    #                                            for index, x in data_users.iterrows()))
    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(((x['user_id'],
                                                 {'review_count':0.35*x['review_count']/max_u_rc,
                                                  'useful':0.35*x['useful']/max_useful,
                                                 **user_build_dict(x,user_cols,[0.35*x['review_count']/max_u_rc,0.35*x['useful']/max_useful])}) for index, x in data_users.iterrows()))
            
    #train-test split
    
    # seed = 12345 #has multiple seeds set up to account for split biases
    # seed = 101
    # seed = 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train,test=random_train_test_split(interactions,test_percentage=0.2,random_state=np.random.RandomState(seed))
    
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    
    assert train.multiply(test).nnz == 0 # make sure train and test are truly disjoint
    return train,test,data_business,dataset,user_features,item_features