def test_random_splitter(test_specs, python_dataset):
    """Verify python_random_split honors single-ratio, ratio-list, and
    unnormalized split-number inputs.

    NOTE: split sizes may not match the requested ratios exactly because the
    test data has a limited number of rows, so an approximate comparison with
    a tolerance is used throughout.
    """
    total = test_specs["number_of_rows"]
    tol = test_specs["tolerance"]

    # Two-way split driven by a single ratio r -> fractions [r, 1 - r].
    splits = python_random_split(
        python_dataset, ratio=test_specs["ratio"], seed=test_specs["seed"]
    )
    two_way = [test_specs["ratio"], 1 - test_specs["ratio"]]
    for part, frac in zip(splits, two_way):
        assert len(part) / total == pytest.approx(frac, tol)

    # Three-way split from an explicit list of ratios.
    splits = python_random_split(
        python_dataset, ratio=test_specs["ratios"], seed=test_specs["seed"]
    )
    assert len(splits) == 3
    for part, frac in zip(splits, test_specs["ratios"]):
        assert len(part) / total == pytest.approx(frac, tol)

    # Unnormalized split numbers should behave like the equivalent ratios.
    splits = python_random_split(
        python_dataset, ratio=test_specs["split_numbers"], seed=test_specs["seed"]
    )
    assert len(splits) == 3
    for part, frac in zip(splits, test_specs["ratios"]):
        assert len(part) / total == pytest.approx(frac, tol)
def test_random_splitter(test_specs, python_dataset):
    """Verify python_random_split sizes and schema preservation for
    single-ratio, ratio-list, and split-number inputs.

    NOTE: split sizes may not match the requested ratios exactly because the
    test data has a limited number of rows, so an approximate comparison with
    a tolerance is used throughout.
    """
    total = test_specs["number_of_rows"]
    tol = test_specs["tolerance"]
    expected_cols = set(python_dataset.columns)

    def _check(parts, fractions):
        # Each split's row fraction must approximate its target, and every
        # split must retain the original DataFrame's columns.
        for part, frac in zip(parts, fractions):
            assert len(part) / total == pytest.approx(frac, tol)
        for part in parts:
            assert set(part.columns) == expected_cols

    # Two-way split driven by a single ratio r -> fractions [r, 1 - r].
    splits = python_random_split(
        python_dataset, ratio=test_specs["ratio"], seed=test_specs["seed"]
    )
    _check(splits, [test_specs["ratio"], 1 - test_specs["ratio"]])

    # Three-way split from an explicit list of ratios.
    splits = python_random_split(
        python_dataset, ratio=test_specs["ratios"], seed=test_specs["seed"]
    )
    assert len(splits) == 3
    _check(splits, test_specs["ratios"])

    # Unnormalized split numbers should behave like the equivalent ratios.
    splits = python_random_split(
        python_dataset, ratio=test_specs["split_numbers"], seed=test_specs["seed"]
    )
    assert len(splits) == 3
    _check(splits, test_specs["ratios"])
data.head() # data = pd.read_csv("/content/snackratings.csv") # data.head() print( "Total number of ratings are\t{}".format(data.shape[0]), "Total number of users are\t{}".format(data[USER].nunique()), "Total number of items are\t{}".format(data[ITEM].nunique()), sep="\n" ) #st.subheader("data loaded") data_train, data_test = python_random_split(data, ratio=0.7) split =data_train.shape[0], data_test.shape[0] st.write("Splitting_Ratio:",split) data = CollabDataBunch.from_df(data_train, seed=42, valid_pct=0.1) y_range = [0.5,5.5] st.write(y_range) factor=N_FACTORS st.write("No. of factors:",factor) learn = collab_learner(data, n_factors=factor, y_range=y_range, wd=1e-1) learn.model
# NOTE(review): truncated chunk — it opens mid-way through a
# parser.add_argument(...) call and ends mid-way through the
# SARSingleNode(...) constructor; the missing pieces are outside this view.
                    default=10, help='top k items to recommend')
parser.add_argument('--data-size', type=str, dest='data_size',
                    default=10, help='Movielens data size: 100k, 1m, 10m, or 20m')
args = parser.parse_args()

# Log run parameters — `run` is presumably an AzureML Run object; confirm upstream.
run.log("top-k", args.top_k)
run.log("data-size", args.data_size)

# Load the pickled ratings DataFrame staged under the mounted data folder.
data_pickle_path = os.path.join(args.data_folder, args.data_file)
data = pd.read_pickle(path=data_pickle_path)

# 75/25 random train/test split.
train, test = python_random_split(data, 0.75)

# instantiate the SAR algorithm and set the index
# Column-name mapping handed to the SAR model.
# NOTE(review): "******" looks like a redacted user-column name — restore before use.
header = {
    "col_user": "******",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

# SAR model; constructor continues past the end of this chunk.
model = SARSingleNode(remove_seen=True, similarity_type="jaccard",
                      time_decay_coefficient=30,