def test_add_score_column(self):
    """Adding a plain score column appends it after the imported columns.

    Ratings are imported by positional column index; afterwards the 'stars'
    field is added again under the name 'score_duplicate' and is expected
    to hold the float value of 'stars' for every raw row.
    """
    importer = RatingsImporter(source=self.raw_source,
                               from_id_column=0,
                               to_id_column=1,
                               score_column=4)
    frame = importer.import_ratings()

    # Layout right after the import.
    self.assertEqual(['from_id', 'to_id', 'score'], list(frame.columns))

    extended_frame = importer.add_score_column('stars',
                                               column_name='score_duplicate')

    # Layout after the extra column has been added.
    self.assertEqual(['from_id', 'to_id', 'score', 'score_duplicate'],
                     list(extended_frame.columns))

    # The new column is read through the frame returned by import_ratings().
    duplicated_scores = list(frame['score_duplicate'])
    stars_as_float = [float(raw_row['stars'])
                      for raw_row in self.raw_source_content]
    self.assertEqual(stars_as_float, duplicated_scores)
def test_import_ratings_by_index(self):
    """Importing with positional column indices yields typed, ordered data.

    Verifies the column layout, that ids are strings and scores are floats,
    and that each column matches the corresponding field of the raw rows.
    """
    importer = RatingsImporter(source=self.raw_source,
                               from_id_column=0,
                               to_id_column=1,
                               score_column=4)
    frame = importer.import_ratings()

    self.assertEqual(['from_id', 'to_id', 'score'], list(frame.columns))

    # Values actually produced by the importer.
    actual_from = list(frame['from_id'])
    actual_to = list(frame['to_id'])
    actual_score = list(frame['score'])

    # Values expected from the raw source content.
    wanted_from = [raw_row['user_id'] for raw_row in self.raw_source_content]
    wanted_to = [raw_row['item_id'] for raw_row in self.raw_source_content]
    wanted_score = [float(raw_row['stars'])
                    for raw_row in self.raw_source_content]

    # Type checks first, then value checks.
    self.assertTrue(all(isinstance(value, str) for value in actual_from))
    self.assertTrue(all(isinstance(value, str) for value in actual_to))
    self.assertTrue(all(isinstance(value, float) for value in actual_score))

    self.assertEqual(wanted_from, actual_from)
    self.assertEqual(wanted_to, actual_to)
    self.assertEqual(wanted_score, actual_score)
def test_add_score_column_w_score_processor(self):
    """Adding a processed score column appends it and keeps values in range.

    Ratings are imported by positional column index; then a 'text_polarity'
    column is added from the 'review_title' field using
    TextBlobSentimentAnalysis as score processor. The test checks the
    column layout before and after, and that every added value lies in
    the closed interval [-1, 1].
    """
    ri = RatingsImporter(source=self.raw_source,
                         from_id_column=0,
                         to_id_column=1,
                         score_column=4)
    ratings = ri.import_ratings()

    expected_columns = ['from_id', 'to_id', 'score']
    self.assertEqual(expected_columns, list(ratings.columns))

    ratings_w_added_column = ri.add_score_column(
        'review_title',
        column_name='text_polarity',
        score_processor=TextBlobSentimentAnalysis())

    expected_columns = ['from_id', 'to_id', 'score', 'text_polarity']
    self.assertEqual(expected_columns, list(ratings_w_added_column.columns))

    # NOTE(review): the column is read through `ratings`, not the returned
    # frame; presumably add_score_column extends the same frame - confirm.
    score_column_added = list(ratings['text_polarity'])

    # A bare generator expression is always truthy, so assertTrue() on it
    # could never fail; all() makes the range check effective.
    self.assertTrue(all(-1 <= score <= 1 for score in score_column_added))
def test_exception_add_score_column(self):
    """add_score_column must raise ValueError for a non-numeric score field.

    The 'review_title' field cannot be converted into a float, so adding it
    as a score column is expected to fail.
    """
    importer = RatingsImporter(source=self.raw_source,
                               from_id_column='user_id',
                               to_id_column='item_id',
                               score_column='stars')

    with self.assertRaises(ValueError):
        importer.add_score_column('review_title', 'text')
def test_import_ratings_w_score_processor(self):
    """Importing with a score processor keeps every score within [-1, 1].

    Ratings are imported by positional column index with NumberNormalizer
    as score processor; the test checks the column layout and that each
    resulting score lies in the closed interval [-1, 1].
    """
    ri = RatingsImporter(source=self.raw_source,
                         from_id_column=0,
                         to_id_column=1,
                         score_column=4,
                         score_processor=NumberNormalizer())
    ratings = ri.import_ratings()

    expected_columns = ['from_id', 'to_id', 'score']
    self.assertEqual(expected_columns, list(ratings.columns))

    score_result = list(ratings['score'])

    # A bare generator expression is always truthy, so assertTrue() on it
    # could never fail; all() makes the range check effective.
    self.assertTrue(all(-1 <= score <= 1 for score in score_result))
def test_import_ratings_w_timestamp_index(self):
    """Importing with a timestamp column index adds a 'timestamp' column.

    Verifies the column layout, that timestamps are strings, and that they
    match the 'timestamp' field of the raw rows.
    """
    importer = RatingsImporter(source=self.raw_source,
                               from_id_column=0,
                               to_id_column=1,
                               score_column=4,
                               timestamp_column=5)
    frame = importer.import_ratings()

    self.assertEqual(['from_id', 'to_id', 'score', 'timestamp'],
                     list(frame.columns))

    actual_timestamps = list(frame['timestamp'])
    wanted_timestamps = [raw_row['timestamp']
                         for raw_row in self.raw_source_content]

    # Type check first, then value check.
    self.assertTrue(all(isinstance(value, str) for value in actual_timestamps))
    self.assertEqual(wanted_timestamps, actual_timestamps)
def test_import_ratings(self):
    # Builds a RatingsImporter over a JSON source with three field configs
    # ('review_title', 'text', 'stars') and runs import_ratings() on it.
    # No assertion is made on the result.
    # NOTE(review): `file_path` is not assigned inside this method; it must
    # come from an enclosing scope that is not visible here - confirm.
    RatingsImporter(
        source=JSONFile(file_path=file_path),
        output_directory="test_ratings",
        rating_configs=[
            RatingsFieldConfig(field_name="review_title",
                               processor=TextBlobSentimentAnalysis()),
            RatingsFieldConfig(field_name="text",
                               processor=TextBlobSentimentAnalysis()),
            RatingsFieldConfig(field_name="stars",
                               processor=NumberNormalizer(min_=0, max_=5))
        ],
        from_field_name="user_id",
        to_field_name="item_id",
        timestamp_field_name="timestamp").import_ratings()
    # NOTE(review): the triple quote below has no visible opening
    # counterpart; presumably this method sits inside a string literal
    # opened earlier in the file - confirm.
"""
def rating_config_run(config_dict: Dict):
    """Build a RatingsImporter from a configuration dict and run the import.

    Args:
        config_dict: configuration read by this function through the keys
            "fields" (iterable of dicts with "field_name" and "processor",
            where "processor" holds a 'class' entry plus the processor
            arguments), "source_type", "raw_source_path",
            "output_directory", "from_field_name", "to_field_name" and
            "timestamp_field_name".

    Raises:
        KeyError: if one of the keys above is missing, or if a class name /
            source type is not present in `runnable_instances`.
    """
    rating_configs = []
    for field in config_dict["fields"]:
        # Pop 'class' from a shallow copy: popping from the caller's dict
        # would strip the key from the configuration and make a second run
        # on the same config_dict fail with KeyError.
        processor_spec = dict(field["processor"])
        class_name = processor_spec.pop('class')
        processor_kwargs = dict_detector(processor_spec)
        rating_configs.append(
            RatingsFieldConfig(
                field_name=field["field_name"],
                processor=runnable_instances[class_name](**processor_kwargs)))

    # The source class is looked up by its configured type name and always
    # receives the raw source path as `file_path`.
    source_class = runnable_instances[config_dict["source_type"]]
    source = source_class(file_path=config_dict["raw_source_path"])

    RatingsImporter(
        source=source,
        output_directory=config_dict["output_directory"],
        rating_configs=rating_configs,
        from_field_name=config_dict["from_field_name"],
        to_field_name=config_dict["to_field_name"],
        timestamp_field_name=config_dict["timestamp_field_name"]
    ).import_ratings()
def test_import_ratings(self):
    """Run a full ratings import over the JSON test dataset.

    The dataset is looked up at a path relative to the test module first;
    if that file cannot be opened because it does not exist, a path
    relative to the repository root is used instead. No assertion is made
    on the imported ratings.
    """
    dataset_path = '../../../datasets/test_import_ratings.json'
    try:
        # Opening (and immediately closing) the file is only a probe.
        probe = open(dataset_path)
        probe.close()
    except FileNotFoundError:
        dataset_path = 'datasets/test_import_ratings.json'
    print(dataset_path)

    json_source = JSONFile(file_path=dataset_path)

    title_config = RatingsFieldConfig(preference_field_name="review_title",
                                      processor=TextBlobSentimentAnalysis())
    text_config = RatingsFieldConfig(preference_field_name="text",
                                     processor=TextBlobSentimentAnalysis())
    stars_config = RatingsFieldConfig(preference_field_name="stars",
                                      processor=NumberNormalizer(min_=0, max_=5))

    importer = RatingsImporter(
        source=json_source,
        output_directory="test_ratings",
        rating_configs=[title_config, text_config, stars_config],
        from_field_name="user_id",
        to_field_name="item_id",
        timestamp_field_name="timestamp")
    importer.import_ratings()
# solo esempio, non presente nel dataset """ title_review_config = RatingsFieldConfig( field_name='review_title', processor=TextBlobSentimentAnalysis() ) """ points_review_config = RatingsFieldConfig(field_name='points', processor=NumberNormalizer(min_=1, max_=5)) ratings_importer = RatingsImporter( source=CSVFile(ratings_filename), #cambia rating_configs=[points_review_config], from_field_name='user_id', to_field_name='item_id', timestamp_field_name='timestamp', ) ratings_frame = ratings_importer.import_ratings() print(ratings_frame) tfidf_classifier_config = ClassifierRecommender(item_field='Plot', field_representation='0', classifier='random_forest') classifier_recsys_config = RecSysConfig( users_directory=users_ca_dir, items_directory=items_ca_dir, ranking_algorithm=tfidf_classifier_config,
def test_exception_import_ratings(self):
    """import_ratings must raise the proper error for invalid columns.

    Cases are checked in this order: a column name missing from the raw
    source (KeyError), a column index missing from the raw source
    (IndexError), and a score column that cannot be converted into a float
    (ValueError).
    """
    failing_cases = (
        # Column name not present in raw source.
        ({'from_id_column': 'not_existent',
          'to_id_column': 'item_id',
          'score_column': 'stars'}, KeyError),
        # Column index not present in raw source.
        ({'from_id_column': 99,
          'to_id_column': 'item_id',
          'score_column': 'stars'}, IndexError),
        # Score column can't be converted into float.
        ({'from_id_column': 'user_id',
          'to_id_column': 'item_id',
          'score_column': 'review_title'}, ValueError),
    )

    for column_kwargs, expected_error in failing_cases:
        # The importer is built outside the assertRaises block: only the
        # import itself is expected to raise.
        importer = RatingsImporter(source=self.raw_source, **column_kwargs)
        with self.assertRaises(expected_error):
            importer.import_ratings()
def test_ratings_to_csv(self):
    """Saving imported ratings to CSV handles duplicates, overwrite and names.

    Three consecutive saves are expected to create 'ratings_frame.csv',
    'ratings_frame (1).csv' and 'ratings_frame (2).csv'; a save with
    overwrite=True must not create a '(3)' file; a save with a custom name
    must create 'ratings_custom_name.csv'.
    """
    importer = RatingsImporter(source=self.raw_source,
                               from_id_column=0,
                               to_id_column=1,
                               score_column=4)
    importer.import_ratings()

    # Plain save followed by the first and second duplicate.
    sequential_outputs = (
        'csv_test/ratings_frame.csv',
        'csv_test/ratings_frame (1).csv',
        'csv_test/ratings_frame (2).csv',
    )
    for produced_file in sequential_outputs:
        importer.imported_ratings_to_csv('csv_test/')
        self.assertTrue(os.path.isfile(produced_file))

    # Save with overwrite: the base file exists and no new duplicate appears.
    importer.imported_ratings_to_csv('csv_test/', overwrite=True)
    self.assertTrue(os.path.isfile('csv_test/ratings_frame.csv'))
    self.assertFalse(os.path.isfile('csv_test/ratings_frame (3).csv'))

    # Save with custom name.
    importer.imported_ratings_to_csv('csv_test/', 'ratings_custom_name')
    self.assertTrue(os.path.isfile('csv_test/ratings_custom_name.csv'))
from orange_cb_recsys.evaluation import RankingAlgEvalModel, NDCG, FNMeasure, KFoldPartitioning, GiniIndex, DeltaGap, \
    ReportEvalModel
from orange_cb_recsys.recsys import ClassifierRecommender, RecSysConfig, RecSys

# Input locations used by the script (relative paths).
ratings_filename = '../../../datasets/ratings_example.json'
items_ca_dir = '../../../orange_cb_recsys/movie_dir1605298315.4501655'
users_ca_dir = '../../../datasets/examples/users_dir'

# Field config for the 'stars' field, processed by NumberNormalizer(min_=1, max_=5).
stars_review_config = RatingsFieldConfig(field_name='stars', processor=NumberNormalizer(min_=1, max_=5))

# Importer reading the ratings from a JSON source.
ratings_importer = RatingsImporter(
    source=JSONFile(ratings_filename),
    rating_configs=[stars_review_config],
    from_field_name='user_id',
    to_field_name='item_id',
    timestamp_field_name='timestamp',
)

ratings_frame = ratings_importer.import_ratings()
print(ratings_frame)

# Recommender configuration on the 'Plot' item field, representation '0',
# with the 'random_forest' classifier.
original_classifier_config = ClassifierRecommender(item_field='Plot', field_representation='0', classifier='random_forest')

# NOTE(review): this call is cut off in the visible source; its remaining
# arguments and closing parenthesis are outside this view.
classifier_recsys_config = RecSysConfig(
    users_directory=users_ca_dir,
    items_directory=items_ca_dir,
    ranking_algorithm=original_classifier_config,