Example 1
def test_random_splitter(test_specs, python_dataset):
    """NOTE: some split results may not match exactly with the ratios, which may be owing to the  limited number of 
    rows in the testing data. A approximate match with certain level of tolerance is therefore used instead for tests.
    """
    splits = python_random_split(python_dataset,
                                 ratio=test_specs["ratio"],
                                 seed=test_specs["seed"])
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"])

    splits = python_random_split(python_dataset,
                                 ratio=test_specs["ratios"],
                                 seed=test_specs["seed"])

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"])
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"])

    splits = python_random_split(python_dataset,
                                 ratio=test_specs["split_numbers"],
                                 seed=test_specs["seed"])

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"])
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"])
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"])
Example 2

def test_random_splitter(test_specs, python_dataset):
    """NOTE: some split results may not match exactly with the ratios, which may be owing to the  limited number of 
    rows in the testing data. A approximate match with certain level of tolerance is therefore used instead for tests.
    """
    splits = python_random_split(
        python_dataset, ratio=test_specs["ratio"], seed=test_specs["seed"]
    )
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)

    splits = python_random_split(
        python_dataset, ratio=test_specs["ratios"], seed=test_specs["seed"]
    )

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)

    splits = python_random_split(
        python_dataset, ratio=test_specs["split_numbers"], seed=test_specs["seed"]
    )

    assert len(splits) == 3
    assert len(splits[0]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert len(splits[1]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert len(splits[2]) / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    for split in splits:
        assert set(split.columns) == set(python_dataset.columns)
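
Both variants of the test read their parameters from `test_specs` and `python_dataset` fixtures that are not shown on this page. The sketch below is a hypothetical reconstruction inferred only from the keys the tests access; the actual fixtures in the repository may use different values.

import pandas as pd
import pytest

@pytest.fixture(scope="module")
def test_specs():
    # Hypothetical values inferred from the keys used in the tests above.
    return {
        "number_of_rows": 1000,
        "ratio": 0.6,                 # single float -> two splits
        "ratios": [0.2, 0.3, 0.5],    # normalized list -> three splits
        "split_numbers": [2, 3, 5],   # un-normalized weights, same proportions as "ratios"
        "seed": 123,
        "tolerance": 0.01,
    }

@pytest.fixture(scope="module")
def python_dataset(test_specs):
    # A dummy DataFrame with the expected number of rows.
    return pd.DataFrame({
        "userID": range(test_specs["number_of_rows"]),
        "rating": [3.0] * test_specs["number_of_rows"],
    })
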
Example 3
        data.head()

        # data = pd.read_csv("/content/snackratings.csv")
        # data.head()

        print(
            "Total number of ratings:\t{}".format(data.shape[0]),
            "Total number of users:\t{}".format(data[USER].nunique()),
            "Total number of items:\t{}".format(data[ITEM].nunique()),
            sep="\n"
        )

        #st.subheader("data loaded")

        data_train, data_test = python_random_split(data, ratio=0.7)

        split = data_train.shape[0], data_test.shape[0]
        st.write("Split sizes (train, test):", split)

        data = CollabDataBunch.from_df(data_train, seed=42, valid_pct=0.1)

        y_range = [0.5, 5.5]
        st.write(y_range)

        factor = N_FACTORS
        st.write("No. of factors:", factor)

        learn = collab_learner(data, n_factors=factor, y_range=y_range, wd=1e-1)
        
        learn.model  # inspect the learner's embedding model architecture
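
The snippet stops after constructing the learner and inspecting `learn.model`. A hedged sketch of the usual next step with fastai v1's collab API follows, assuming the same `learn` and `st` objects; the epoch count and learning rate are illustrative, not taken from the original app.

        # Train the embedding model with the one-cycle policy (illustrative hyperparameters).
        learn.fit_one_cycle(5, max_lr=5e-3)

        # Report the validation loss in the Streamlit app.
        st.write("Validation loss:", learn.validate())
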
Example 4
parser.add_argument('--top-k',
                    type=int,
                    dest='top_k',
                    default=10,
                    help='top k items to recommend')
parser.add_argument('--data-size',
                    type=str,
                    dest='data_size',
                    default='100k',
                    help='Movielens data size: 100k, 1m, 10m, or 20m')
args = parser.parse_args()

run.log("top-k", args.top_k)
run.log("data-size", args.data_size)
data_pickle_path = os.path.join(args.data_folder, args.data_file)

data = pd.read_pickle(path=data_pickle_path)

train, test = python_random_split(data, 0.75)

# column-name mapping used by the SAR algorithm
header = {
    "col_user": "UserId",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

# instantiate the SAR algorithm with the column mapping above
model = SARSingleNode(remove_seen=True,
                      similarity_type="jaccard",
                      time_decay_coefficient=30,
                      **header)
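
The listing is cut off after the model is constructed. Below is a sketch of how such an AzureML training script typically continues, assuming the `reco_utils` SAR and evaluation APIs of the same era; the metric choice and the import path are assumptions, not shown in the original.

from reco_utils.evaluation.python_evaluation import map_at_k  # assumed import path

# Fit SAR on the training split and produce top-k recommendations per user.
model.fit(train)
top_k_df = model.recommend_k_items(test, top_k=args.top_k)

# Log a ranking metric back to the AzureML run (metric choice is illustrative).
eval_map = map_at_k(test, top_k_df,
                    col_user="UserId", col_item="MovieId",
                    col_rating="Rating", k=args.top_k)
run.log("map_at_k", eval_map)
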