Example 1
def main():
    args = parse_args()
    np.random.seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)
    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    print("Generating {} negative samples for each user"
          .format(args.negatives))
    for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)):  # noqa: E501
        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism

        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, args.negatives)))

    print("Saving train and test CSV files to {}".format(args.output))
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
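
These snippets all rely on module-level constants (USER_COLUMN, ITEM_COLUMN, MIN_RATINGS, the output filenames) and a parse_args() helper defined elsewhere in their files. As a rough, hypothetical sketch of what Example 1 assumes (argument names inferred from how args is used above; defaults invented for illustration):

import argparse

def parse_args():
    # Hypothetical reconstruction: names match the attributes accessed above,
    # defaults are illustrative only.
    parser = argparse.ArgumentParser(description='Preprocess implicit-feedback ratings')
    parser.add_argument('--path', type=str, default='ml-20m/ratings.csv',
                        help='path to the raw ratings file')
    parser.add_argument('--output', type=str, default='.',
                        help='directory for the preprocessed output files')
    parser.add_argument('--seed', type=int, default=0,
                        help='seed for the negative-sample RNG')
    parser.add_argument('--negatives', type=int, default=999,
                        help='number of negative samples per user')
    return parser.parse_args()
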
Example 2
def main():
    args = parse_args()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    LOGGER.log(key=tags.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # keeps the first occurrence of each row, preserving order

    # now we have filtered and sorted by time data, we can split test data out
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])
    train_data = train_data.sort_values([USER_COLUMN, ITEM_COLUMN])

    train_data.to_pickle(args.output + '/train_ratings.pickle')
    test_data.to_pickle(args.output + '/test_ratings.pickle')
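
From Example 2 onward, pd.factorize replaces the explicit user_map/item_map dictionaries of Example 1. It assigns sequential IDs in first-seen order, which is exactly what the dictionary construction does. A minimal standalone check:

import pandas as pd

# factorize returns (codes, uniques): codes are 0-based sequential IDs
codes, uniques = pd.factorize(pd.Series(['u7', 'u3', 'u7', 'u9']))
print(codes.tolist())   # [0, 1, 0, 2]  (first-seen order)
print(list(uniques))    # ['u7', 'u3', 'u9']
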
Example 3
def main():
    args = parse_args()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # keeps the first occurrence of each row, preserving order

    # now we have filtered and sorted by time data, we can split test data out
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # need to pop for each group
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    # Note: there is no way to preserve the reference training-data ordering,
    # because a Python set and multiple processes are involved. This should
    # not matter, since the data is shuffled again later.
    # Save the fixed train and test splits.
    train_ratings = torch.from_numpy(train_data.values)
    torch.save(train_ratings, args.output + '/train_ratings.pt')
    test_ratings = torch.from_numpy(test_data.values)
    torch.save(test_ratings, args.output + '/test_ratings.pt')
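
The leave-one-out split in Examples 2 and 3 (groupby(...).tail(1) for test, apply(lambda x: x.iloc[:-1]) for train) relies on the frame already being sorted by timestamp. A toy illustration, with hypothetical column names:

import pandas as pd

df = pd.DataFrame({'user_id': [0, 0, 0, 1, 1],
                   'item_id': [10, 11, 12, 20, 21]})  # already time-sorted
grouped = df.groupby('user_id', group_keys=False)
test = grouped.tail(1)                        # last interaction per user
train = grouped.apply(lambda x: x.iloc[:-1])  # everything but the last
print(test.values.tolist())   # [[0, 12], [1, 21]]
print(train.values.tolist())  # [[0, 10], [0, 11], [1, 20]]
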
Example 4
def main():
    # TODO: Add random seed as parameter
    np.random.seed(0)
    args = parse_args()

    df = implicit_load(args.path, sort=False)
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= 20)

    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in df.itertuples():
        user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN))  # noqa: E501

    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))
    for user in range(len(original_users)):
        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism

        test_ratings.append((user, test_item))
        test_negs.append(list(np.random.choice(all_negs, NUMBER_NEGATIVES)))

    # serialize
    df_train_ratings = pd.DataFrame(list(all_ratings))
    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False, header=False, sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False, header=False, sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False, header=False, sep='\t')
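
One detail worth knowing about Examples 1, 4, and 7: np.random.choice samples with replacement by default, so a user's negative list can contain duplicate items (Example 7's comments call this out explicitly). Passing replace=False makes each sampled negative distinct, at slightly higher cost:

import numpy as np

np.random.seed(0)
candidates = list(range(100))
with_dups = np.random.choice(candidates, 10)               # may repeat items
no_dups = np.random.choice(candidates, 10, replace=False)  # all distinct
print(len(set(no_dups)))  # always 10
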
Example 5
def main():
    args = parse_args()
    np.random.seed(args.seed)

    print("\nLoading raw data from {}\n".format(args.file))
    df = implicit_load(args.file, sort=False)

    print(
        "\nFiltering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby(USER_COLUMN)
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

    print("Mapping original user and item IDs to new sequential IDs")
    original_users = df[USER_COLUMN].unique()
    original_items = df[ITEM_COLUMN].unique()

    nb_users = len(original_users)
    nb_items = len(original_items)

    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
    df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

    assert df[USER_COLUMN].max() == len(original_users) - 1
    assert df[ITEM_COLUMN].max() == len(original_items) - 1

    print("Creating list of items for each user")
    # Need to sort before popping to get last item
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df.sort_values(by='timestamp', inplace=True)
    all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
    user_to_items = defaultdict(list)
    for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
        user_to_items[getattr(row, USER_COLUMN)].append(
            getattr(row, ITEM_COLUMN))  # noqa: E501

    train_ratings = []
    test_ratings = []
    test_negs = []
    all_items = set(range(len(original_items)))

    print(
        "Generating {} negative samples for each user and creating training set"
        .format(args.negatives))

    for user in tqdm(range(len(original_users)),
                     desc='Users',
                     total=len(original_users)):  # noqa: E501
        all_negs = all_items - set(user_to_items[user])
        all_negs = sorted(list(all_negs))  # determinism
        negs = random.sample(all_negs, args.negatives)

        test_item = user_to_items[user].pop()

        all_ratings.remove((user, test_item))

        test_negs.append([user, test_item] + list(negs))

        test_ratings.append([user, test_item] +
                            user_to_items[user][-args.history_size:])

        while len(user_to_items[user]) > args.history_size:
            target_item = user_to_items[user].pop()
            train_ratings.append([user, target_item] +
                                 user_to_items[user][-args.history_size:])

    print("\nSaving train and test CSV files to {}".format(args.output))

    df_train_ratings = pd.DataFrame(list(train_ratings))

    print('Saving data description ...')
    # use args.output and args.history_size (consistent with the rest of the
    # script) and a context manager so the file is always closed
    with open(os.path.join(args.output, 'data_summary.txt'), 'w') as f_writer:
        f_writer.write('users = {}, items = {}, history_size = {}, '
                       'train_entries = {}'.format(nb_users, nb_items,
                                                   args.history_size,
                                                   len(df_train_ratings)))

    df_train_ratings['fake_rating'] = 1
    df_train_ratings.to_csv(os.path.join(args.output, TRAIN_RATINGS_FILENAME),
                            index=False,
                            header=False,
                            sep='\t')

    df_test_ratings = pd.DataFrame(test_ratings)
    df_test_ratings['fake_rating'] = 1
    df_test_ratings.to_csv(os.path.join(args.output, TEST_RATINGS_FILENAME),
                           index=False,
                           header=False,
                           sep='\t')

    df_test_negs = pd.DataFrame(test_negs)
    df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                        index=False,
                        header=False,
                        sep='\t')

    print("Data preprocess done!\n")
Example 6
def main():
    args = parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    if args.test == 'less_user':
        to_drop = set(list(df[USER_COLUMN].unique())[-100:])
        df = df[~df[USER_COLUMN].isin(to_drop)]
    if args.test == 'less_item':
        to_drop = set(list(df[ITEM_COLUMN].unique())[-100:])
        df = df[~df[ITEM_COLUMN].isin(to_drop)]
    if args.test == 'more_user':
        sample = df.sample(frac=0.2).copy()
        sample[USER_COLUMN] = sample[USER_COLUMN] + 10000000
        df = pd.concat([df, sample])  # DataFrame.append was removed in pandas 2.0
        # keep only users with at least two interactions, so something
        # remains in the train set after the leave-one-out split
        users = df[USER_COLUMN]
        df = df[users.isin(users[users.duplicated(keep=False)])]
    if args.test == 'more_item':
        sample = df.sample(frac=0.2).copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 10000000
        df = pd.concat([df, sample])

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # keeps the first occurrence of each row, preserving order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    if args.valid_negative > 0:
        test_negs = test_negs.reshape(-1, args.valid_negative)
    else:
        test_negs = test_negs.reshape(test_data.shape[0], 0)

    if args.test == 'more_pos':
        mask = np.random.rand(len(test_data)) < 0.5
        sample = test_data[mask].copy()
        sample[ITEM_COLUMN] = sample[ITEM_COLUMN] + 5
        test_data = pd.concat([test_data, sample])  # append() was removed in pandas 2.0
        test_negs_copy = test_negs[mask]
        test_negs = torch.cat((test_negs, test_negs_copy), dim=0)
    if args.test == 'less_pos':
        mask = np.random.rand(len(test_data)) < 0.5
        test_data = test_data[mask]
        test_negs = test_negs[mask]

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    # slicing instead of indexing to keep dimensions
    test_users_pos = test_ratings[:, 0:1]
    test_items_pos = test_ratings[:, 1:2]
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1,
                                                  dim=0)
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs),
                           dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos,
                                       dtype=torch.float32).repeat(
                                           1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels),
                            dim=1).reshape(-1, 1)
    dtypes = {
        'user': str(test_users.dtype),
        'item': str(test_items.dtype),
        'label': str(test_labels.dtype)
    }
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    if args.test == 'other_names':
        dtypes = {
            'user_2': str(test_users.dtype),
            'item_2': str(test_items.dtype),
            'label_2': str(test_labels.dtype)
        }
        save_feature_spec(user_cardinality=user_cardinality,
                          item_cardinality=item_cardinality,
                          dtypes=dtypes,
                          test_negative_samples=args.valid_negative,
                          output_path=args.output + '/feature_spec.yaml',
                          user_feature_name='user_2',
                          item_feature_name='item_2',
                          label_feature_name='label_2')
    else:
        save_feature_spec(user_cardinality=user_cardinality,
                          item_cardinality=item_cardinality,
                          dtypes=dtypes,
                          test_negative_samples=args.valid_negative,
                          output_path=args.output + '/feature_spec.yaml')
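
The test-set layout in Examples 6 and 8 repeats each user ID valid_negative + 1 times and lines the items up as one positive followed by that user's negatives. A shape walkthrough with toy sizes:

import torch

valid_negative = 3
users_pos = torch.tensor([[0], [1]])         # (num_users, 1)
items_pos = torch.tensor([[10], [20]])       # (num_users, 1)
negs = torch.tensor([[5, 6, 7], [8, 9, 4]])  # (num_users, valid_negative)

# each user appears once per (positive + negatives) row
users = users_pos.repeat_interleave(valid_negative + 1, dim=0)
# positive first, then negatives, flattened row by row
items = torch.cat((items_pos, negs), dim=1).reshape(-1, 1)
print(torch.cat((users, items), dim=1).tolist())
# [[0, 10], [0, 5], [0, 6], [0, 7], [1, 20], [1, 8], [1, 9], [1, 4]]
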
Example 7
def main():
    args = parse_args()
    device = exp.get_device()
    chrono = exp.chrono()

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    # ------------------------------------------------------------------------------------------------------------------
    with chrono.time('task', skip_obs=0):
        print("Filtering out users with less than {} ratings".format(
            MIN_RATINGS))
        grouped = df.groupby(USER_COLUMN)
        # mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_MIN_RATINGS, value=MIN_RATINGS)
        df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

        print("Mapping original user and item IDs to new sequential IDs")
        original_users = df[USER_COLUMN].unique()
        original_items = df[ITEM_COLUMN].unique()

        user_map = {user: index for index, user in enumerate(original_users)}
        item_map = {item: index for index, item in enumerate(original_items)}

        df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user])
        df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item])

        assert df[USER_COLUMN].max() == len(original_users) - 1
        assert df[ITEM_COLUMN].max() == len(original_items) - 1

        print("Creating list of items for each user")
        # Need to sort before popping to get last item
        df.sort_values(by='timestamp', inplace=True)
        all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN]))
        user_to_items = defaultdict(list)

        for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)):
            user_to_items[getattr(row, USER_COLUMN)].append(
                getattr(row, ITEM_COLUMN))  # noqa: E501

        test_ratings = []
        test_negs = []
        all_items = set(range(len(original_items)))

        print("Generating {} negative samples for each user".format(
            args.negatives))
        # mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_NUM_EVAL, value=args.negatives)

        # The default of np.random.choice is replace=True
        # mlperf_log.ncf_print(key=mlperf_log.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)

        #===========================================================================
        #== First random operation triggers the clock start. =======================
        #===========================================================================
        # mlperf_log.ncf_print(key=mlperf_log.RUN_START)
        # mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_EVAL_NEG_GEN)

        for user in tqdm(range(len(original_users)),
                         desc='Users',
                         total=len(original_users)):  # noqa: E501
            test_item = user_to_items[user].pop()

            all_ratings.remove((user, test_item))
            all_negs = all_items - set(user_to_items[user])
            all_negs = sorted(list(all_negs))  # determinism

            test_ratings.append((user, test_item))
            test_negs.append(list(np.random.choice(all_negs, args.negatives)))

        print("Saving train and test CSV files to {}".format(args.output))
        df_train_ratings = pd.DataFrame(list(all_ratings))
        df_train_ratings['fake_rating'] = 1
        df_train_ratings.to_csv(os.path.join(args.output,
                                             TRAIN_RATINGS_FILENAME),
                                index=False,
                                header=False,
                                sep='\t')

        # mlperf_log.ncf_print(key=mlperf_log.INPUT_SIZE, value=len(df_train_ratings))

        df_test_ratings = pd.DataFrame(test_ratings)
        df_test_ratings['fake_rating'] = 1
        df_test_ratings.to_csv(os.path.join(args.output,
                                            TEST_RATINGS_FILENAME),
                               index=False,
                               header=False,
                               sep='\t')

        df_test_negs = pd.DataFrame(test_negs)
        df_test_negs.to_csv(os.path.join(args.output, TEST_NEG_FILENAME),
                            index=False,
                            header=False,
                            sep='\t')
    # ------------------------------------------------------------------------------------------------------------------

    exp.report()
Example 8
def main():
    args = parse_args()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Loading raw data from {}".format(args.path))
    df = implicit_load(args.path, sort=False)

    print("Mapping original user and item IDs to new sequential IDs")
    df[USER_COLUMN] = pd.factorize(df[USER_COLUMN])[0]
    df[ITEM_COLUMN] = pd.factorize(df[ITEM_COLUMN])[0]

    user_cardinality = df[USER_COLUMN].max() + 1
    item_cardinality = df[ITEM_COLUMN].max() + 1

    # Need to sort before popping to get last item
    df.sort_values(by='timestamp', inplace=True)

    # clean up data
    del df['rating'], df['timestamp']
    df = df.drop_duplicates()  # keeps the first occurrence of each row, preserving order

    # Test set is the last interaction for a given user
    grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
    test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
    # Train set is all interactions but the last one
    train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

    sampler = _TestNegSampler(train_data.values, args.valid_negative)
    test_negs = sampler.generate().cuda()
    test_negs = test_negs.reshape(-1, args.valid_negative)

    # Reshape train set into user,item,label tabular and save
    train_ratings = torch.from_numpy(train_data.values).cuda()
    train_labels = torch.ones_like(train_ratings[:, 0:1], dtype=torch.float32)
    torch.save(train_ratings, os.path.join(args.output, TRAIN_0))
    torch.save(train_labels, os.path.join(args.output, TRAIN_1))

    # Reshape test set into user,item,label tabular and save
    # All users have the same number of items, items for a given user appear consecutively
    test_ratings = torch.from_numpy(test_data.values).cuda()
    # slicing instead of indexing to keep dimensions
    test_users_pos = test_ratings[:, 0:1]
    test_items_pos = test_ratings[:, 1:2]
    test_users = test_users_pos.repeat_interleave(args.valid_negative + 1,
                                                  dim=0)
    test_items = torch.cat((test_items_pos.reshape(-1, 1), test_negs),
                           dim=1).reshape(-1, 1)
    positive_labels = torch.ones_like(test_users_pos, dtype=torch.float32)
    negative_labels = torch.zeros_like(test_users_pos,
                                       dtype=torch.float32).repeat(
                                           1, args.valid_negative)
    test_labels = torch.cat((positive_labels, negative_labels),
                            dim=1).reshape(-1, 1)
    dtypes = {
        'user': str(test_users.dtype),
        'item': str(test_items.dtype),
        'label': str(test_labels.dtype)
    }
    test_tensor = torch.cat((test_users, test_items), dim=1)
    torch.save(test_tensor, os.path.join(args.output, TEST_0))
    torch.save(test_labels, os.path.join(args.output, TEST_1))

    save_feature_spec(user_cardinality=user_cardinality,
                      item_cardinality=item_cardinality,
                      dtypes=dtypes,
                      test_negative_samples=args.valid_negative,
                      output_path=args.output + '/feature_spec.yaml')
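
The labels built at the end of Examples 6 and 8 follow the same row order as the users/items tensors: a 1 for each user's positive, then valid_negative zeros. A matching toy check:

import torch

valid_negative = 3
pos = torch.ones(2, 1)                             # one positive per user
neg = torch.zeros(2, 1).repeat(1, valid_negative)  # negatives per user
labels = torch.cat((pos, neg), dim=1).reshape(-1, 1)
print(labels.squeeze().tolist())
# [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
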