def test_predict(self):
    """
    Run NXPageRank.predict for user 'A000' on a small in-memory ratings frame
    and log every entry of the returned ranking.
    """
    alg = NXPageRank()
    rating_records = [
        ("A000", "tt0114576", 0.5, "54654675"),
        ("A000", "tt0112453", -0.5, "54654675"),
        ("A001", "tt0114576", 0.8, "54654675"),
        ("A001", "tt0112896", -0.4, "54654675"),
        ("A000", "tt0113041", 0.6, "54654675"),
        ("A002", "tt0112453", -0.2, "54654675"),
        ("A002", "tt0113497", 0.5, "54654675"),
        ("A003", "tt0112453", -0.8, "54654675"),
    ]
    ratings = pd.DataFrame.from_records(
        rating_records, columns=["from_id", "to_id", "score", "timestamp"])

    # The contents directory is addressed relative to the working directory:
    # probe one known serialized item and fall back to the shorter relative
    # path when it cannot be opened from here.
    path = "../../../contents/movielens_test1591885241.5520566"
    try:
        with lzma.open(os.path.join(path, "tt0114576.xz"), "r"):
            pass
    except FileNotFoundError:
        path = "contents/movielens_test1591885241.5520566"

    rank = alg.predict('A000', ratings, 1, path, ['tt0114576'])

    logger.info('pg_rk results')
    for item_id in rank.keys():
        logger.info('%s %s', str(item_id), str(rank[item_id]))
def extract_corpus(self, source: RawInformationSource, field_list: List[str],
                   preprocessor_list: List[InformationProcessor]) -> list:
    """
    Builds the corpus from the source: for every document the selected fields
    are lower-cased and concatenated, the resulting text is run through every
    preprocessor in order and finally through process_data_granularity()

    Args:
        source (RawInformationSource): raw data to iterate over, one document at a time
        field_list (List[str]): names of the fields to read from each document
        preprocessor_list (List[InformationProcessor]): processors applied in sequence
            to the concatenated text; each one receives the output of the previous one

    Returns:
        corpus (list): one processed entry per document, in source order
    """
    corpus = []

    for doc_index, doc in enumerate(source):
        logger.info("Document %d", doc_index)

        # every field value is lower-cased and prefixed by a single blank
        text = "".join(" " + doc[field_name].lower() for field_name in field_list)

        for step in preprocessor_list:
            text = step.process(text)

        corpus.append(self.process_data_granularity(text))

    return corpus
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
    """
    Compute the correlation between the two ranks

    Every item that appears in both frames contributes a pair
    (position in truth, position in predictions); the configured correlation
    coefficient is computed over those two position sequences.

    Args:
        predictions (pd.DataFrame): frame with a 'to_id' column, in ranking order
        truth (pd.DataFrame): frame with a 'to_id' column, in ranking order

    Returns:
        (float): value of the specified correlation metric; 0.0 when fewer than
            two positions are shared, 0 when the configured method is not one of
            'pearson', 'kendall', 'spearman'
    """
    logger.info("Computing correlation")

    truth_labels = truth['to_id'].tolist()
    prediction_labels = predictions['to_id'].tolist()

    # label -> positions at which it occurs in the predictions (ascending)
    prediction_positions = {}
    for p_index, p_value in enumerate(prediction_labels):
        prediction_positions.setdefault(p_value, []).append(p_index)

    # Plain lists instead of Series.append / Series.iteritems: both were
    # removed in pandas 2.0 and appending to a Series copies it every time.
    t_positions = []
    p_positions = []
    for t_index, t_value in enumerate(truth_labels):
        for p_index in prediction_positions.get(t_value, ()):
            t_positions.append(t_index)
            p_positions.append(p_index)

    if len(t_positions) <= 1:
        return 0.0

    coef = 0
    if self.__method == 'pearson':
        coef, _ = pearsonr(t_positions, p_positions)
    elif self.__method == 'kendall':
        coef, _ = kendalltau(t_positions, p_positions)
    elif self.__method == 'spearman':
        coef, _ = spearmanr(t_positions, p_positions)
    return coef
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
    """
    Compute the MAE metric

    Ratings are compared position by position; when the two frames have a
    different number of rows the longer one is cut to the length of the
    shorter one.

    Args:
        truth (pd.DataFrame): dataframe whose columns are: to_id, rating
        predictions (pd.DataFrame): dataframe whose columns are: to_id, rating

    Returns:
        (float): The Mean Absolute Error
    """
    logger.info("Computing MAE")

    predicted = pd.Series(predictions['rating'].values, name="rating", dtype=float)
    actual = pd.Series(truth['rating'].values, name="rating", dtype=float)

    # only the first `shared_length` ratings of each side take part
    shared_length = min(len(predicted), len(actual))
    predicted = predicted[0:shared_length]
    actual = actual[0:shared_length]

    absolute_errors = (predicted - actual).apply(abs)
    return np.mean(absolute_errors)
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
    """
    Compute the RMSE metric

    Ratings are compared position by position; when the two frames have a
    different number of rows the longer one is cut to the length of the
    shorter one.

    Args:
        truth (pd.DataFrame): dataframe whose columns are: to_id, rating
        predictions (pd.DataFrame): dataframe whose columns are: to_id, rating

    Returns:
        (float): The Root Mean Squared Error
    """
    logger.info("Computing RMSE")

    predicted = pd.Series(predictions['rating'].values, name="rating", dtype=float)
    actual = pd.Series(truth['rating'].values, name="rating", dtype=float)

    # only the first `shared_length` ratings of each side take part
    shared_length = min(len(predicted), len(actual))
    predicted = predicted[0:shared_length]
    actual = actual[0:shared_length]

    squared_errors = np.square(predicted - actual)
    return np.sqrt(np.mean(squared_errors))
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
    """
    Compute the Mean Reciprocal Rank metric

    Where:
        • Q is the set of recommendation lists
        • rank(i) is the position of the first relevant item in the i-th recommendation list

    As implemented, every (truth label, prediction label) pair with equal
    labels adds (truth index + 1) / (prediction index + 1) to the score, and
    the sum is divided by the number of truth labels.

    Args:
        truth (pd.DataFrame): dataframe whose columns are: to_id, rating
        predictions (pd.DataFrame): dataframe whose columns are: to_id, rating;
            it represents the ranking of all the items in the test set,
            first n will be considered relevant,
            with n equal to the number of relevant items in the test set

    Returns:
        (float): the mrr value, 0 when there are no truth labels
    """
    logger.info("Computing MRR")

    prediction_labels, truth_labels = super()._get_labels(predictions, truth)

    if len(truth_labels) == 0:
        return 0

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
    # The prediction pairs are materialized once instead of once per truth label.
    prediction_items = list(prediction_labels.items())

    mrr = 0
    for t_index, t_value in truth_labels.items():
        for p_index, p_value in prediction_items:
            if t_value == p_value:
                mrr += (int(t_index) + 1) / (int(p_index) + 1)
    return mrr / len(truth_labels)
def extract_profile(user_id: str, graph: FullGraph) -> Dict:
    """
    Builds the profile of a user from the graph: every successor of the user
    node becomes a key of the profile and the 'weight' stored in the link
    between the user node and that successor becomes its value

    Args:
        user_id (str): id of the user node whose profile is extracted
        graph (FullGraph): graph queried for the successors of the user node and
            for the data of the links that connect the user to them

    Output example: if the successors of user 'A' are 'I1' and 'I2' and the links
    A -> I1 and A -> I2 have weight 0.2 and 0.4 respectively, the result is

        {'I1': 0.2, 'I2': 0.4}

    Returns:
        profile (dict): successor nodes of the user as keys, weight of the
            connecting link as values
    """
    profile = {}
    for item_node in graph.get_successors(user_id):
        weight = graph.get_link_data(user_id, item_node)['weight']
        profile[item_node] = weight
        logger.info('unpack %s, %s', str(item_node), str(weight))
    return profile
def test_predict(self):
    """
    Check NXPageRank.predict for user 'A001' in three configurations:
    without a graph, with a graph, and with a graph in personalized mode.
    """
    user_ratings = ratings[ratings['from_id'] == 'A001']

    # no graph given: the ranking is expected to be empty
    rank = NXPageRank().predict(user_ratings, 2)
    self.assertEqual(rank, {})

    # plain page rank over the graph
    rank = NXPageRank(graph=graph).predict(user_ratings, 1)
    logger.info('pg_rk results')
    for item_id in rank.keys():
        print(str(item_id) + " " + str(rank[item_id]))
    self.assertIn('tt0112453', rank.keys())

    # NOTE: the variant with a feature selection algorithm (NXFSPageRank)
    # is not exercised by this test.

    # personalized page rank over the graph
    rank_personalized = NXPageRank(graph=graph, personalized=True).predict(user_ratings, 1)
    logger.info('pg_rk results')
    for item_id in rank_personalized.keys():
        print(str(item_id) + " " + str(rank_personalized[item_id]))
    self.assertIn('tt0113041', rank_personalized)
def get_properties(self, raw_source: RawInformationSource) -> List[PropertiesDict]:
    """
    Builds one PropertiesDict for every raw content of the source

    When a field name list is configured only those fields are kept, and only
    if the raw content holds a non-None value for them; otherwise the raw
    content is used as it is. In 'only_retrieved_evaluated' mode the entries
    whose value is the empty string are dropped as well.

    Args:
        raw_source (RawInformationSource): source to iterate over

    Returns:
        List[PropertiesDict]: one element per raw content, in source order
    """
    logger.info("Extracting exogenous properties from local dataset")

    collected = []
    for raw_content in raw_source:
        if self.__field_name_list is not None:
            selected = {}
            for field in self.__field_name_list:
                if raw_content.get(field) is not None:
                    selected[field] = raw_content[field]
        else:
            selected = raw_content

        if self.mode == 'only_retrieved_evaluated':
            selected = {field: value for field, value in selected.items() if value != ''}

        collected.append(PropertiesDict(selected))

    return collected
def get_properties(self, name: str, raw_content: Dict[str, object]) -> PropertiesDict:
    """
    Execute the properties couple retrieval

    The retrieval strategy is picked from self.mode; an unrecognized mode
    yields an empty set of properties.

    Args:
        name (str): string identifier of the returned properties object
        raw_content: represent a row in the dataset that is being processed

    Returns:
        PropertiesDict
    """
    logger.info("Extracting exogenous properties")

    mode = self.mode
    if mode == 'only_retrieved_evaluated':
        prop_dict = self.__get_only_retrieved_evaluated(raw_content)
    elif mode == 'all_retrieved':
        prop_dict = self.__get_all_properties_retrieved(raw_content)
    elif mode == 'original_retrieved':
        prop_dict = self.__get_original_retrieved(raw_content)
    elif mode == 'all':
        prop_dict = self.__get_all_properties(raw_content)
    else:
        prop_dict = {}

    # diagnostic output goes through the logger rather than to stdout
    logger.debug("Retrieved properties: %s", prop_dict)

    return PropertiesDict(name, prop_dict)
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame):
    """
    Plots, for every item of the truth frame, how often it was recommended
    against how popular it is, and stores the plot as an svg file.
    A second plot without the never-recommended items is stored as well when
    at least one item of the truth frame has no recommendation.

    Args:
        truth (pd.DataFrame): original rating frame used for recsys config
        predictions (pd.DataFrame): dataframe with recommendations for multiple users
    """
    logger.info("Computing pop recs correlation")

    def build_plot(popularities_, recommendations_, algorithm_name_, out_dir_):
        # Scatter plot saved as '<out_dir>/pop-recs_<algorithm_name>.svg'
        plt.scatter(popularities_, recommendations_, marker='o', s=20, c='orange',
                    edgecolors='black', linewidths=0.05)
        plt.title('{}'.format(algorithm_name_))
        plt.xlabel('Popularity')
        plt.ylabel('Recommendation frequency')
        plt.savefig('{}/pop-recs_{}.svg'.format(out_dir_, algorithm_name_))
        plt.clf()

    # (item, number of occurrences in truth), most popular first
    pop_by_items = Counter(truth[['to_id']].values.flatten()).most_common()

    # item -> number of times it was recommended (0 for items never recommended)
    recs_by_item = Counter(predictions[['to_id']].values.flatten())

    popularities = [pop for _, pop in pop_by_items]
    recommendations = [recs_by_item[item] for item, _ in pop_by_items]

    recommended_only = [(pop, recs) for pop, recs in zip(popularities, recommendations)
                        if recs != 0]
    popularities_no_zeros = [pop for pop, _ in recommended_only]
    recommendations_no_zeros = [recs for _, recs in recommended_only]
    at_least_one_zero = len(recommended_only) != len(popularities)

    build_plot(popularities, recommendations,
               self.get_file_name(), self.get_output_directory())

    if at_least_one_zero:
        build_plot(popularities_no_zeros, recommendations_no_zeros,
                   self.get_file_name() + '-no-zeros', self.get_output_directory())
def get_unrated_items(items_directory: str, ratings) -> List[Content]:
    """
    Gets the items that a user has not rated

    Item ids are taken from the file names found in the items directory
    (extension removed, the 'search_index' entry excluded); rated ids are
    compared after stripping every character that is neither a word character
    nor whitespace.

    Args:
        items_directory (str): Path to the items directory
        ratings (pd.DataFrame): Ratings of a user

    Returns:
        unrated_items (List<Content>): List of items that the user has not rated
    """
    directory_item_ids = [os.path.splitext(filename)[0]
                          for filename in os.listdir(items_directory)
                          if filename != 'search_index']

    rated_item_ids = {re.sub(r'[^\w\s]', '', item_id) for item_id in ratings.to_id}

    # Every id below comes from the directory listing, so no further
    # intersection with that listing is needed.
    unrated_item_ids = [item_id for item_id in directory_item_ids
                        if item_id not in rated_item_ids]

    logger.info("Loading unrated items")
    unrated_items = [
        load_content_instance(items_directory, item_id)
        for item_id in progbar(unrated_item_ids, prefix="Loading unrated items:")]

    return unrated_items
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame):
    """
    Plots how many times each item occurs in the truth frame, from the most
    to the least frequent item, and stores the plot as an svg file

    Args:
        truth (pd.DataFrame): original rating frame used for recsys config
        predictions (pd.DataFrame): dataframe with recommendations for multiple users
            (not read by this method)
    """
    logger.info("Computing recs long tail distr")

    item_counts = Counter(truth[['to_id']].values.flatten())

    # most_common() yields (item, count) pairs sorted by decreasing count
    ordered_counts = [count for _, count in item_counts.most_common()]

    plt.plot(ordered_counts)
    plt.title('{}'.format(self.file_name))
    plt.ylabel('Num of recommendations')
    plt.xlabel('Recommended items')
    plt.savefig('{}/recs-long-tail-distr_{}.svg'.format(self.output_directory, self.file_name))
    plt.clf()
def show_progress(coll, milestones=10):
    """
    Yields the elements of coll one by one and logs how many have been
    processed every `milestones` elements

    Args:
        coll: iterable to go through
        milestones (int): a log line is emitted each time the number of
            processed elements is a multiple of this value
    """
    for processed, element in enumerate(coll, start=1):
        yield element
        # reached only once the consumer asks for the next element
        if processed % milestones == 0:
            logger.info('Processed %s user in the group', processed)
def get_properties(self, name: str, raw_content: Dict[str, object]) -> PropertiesDict:
    """
    Collects the properties of a raw content as strings

    Without a configured field name list every key of the raw content is used.
    With one, its names are used instead, limited to as many names as the raw
    content has keys. A name missing from the raw content gets the empty
    string as value; in 'only_retrieved_evaluated' mode empty values are not
    kept.

    Args:
        name (str): string identifier of the returned properties object
        raw_content: represent a row in the dataset that is being processed

    Returns:
        PropertiesDict
    """
    logger.info("Extracting exogenous properties")

    if self.__field_name_list is None:
        field_names = list(raw_content.keys())
    else:
        # at most one configured name per key of the raw content
        field_names = self.__field_name_list[:len(raw_content)]

    prop_dict = {}
    for field_name in field_names:
        if field_name in raw_content.keys():
            value = str(raw_content[field_name])
        else:
            value = ''

        if value == '' and self.mode == 'only_retrieved_evaluated':
            continue

        prop_dict[field_name] = value

    return PropertiesDict(name, prop_dict)
def get_avg_pop_by_users(data: pd.DataFrame, pop_by_items: Dict[str, object],
                         group: Set[str] = None) -> Dict[str, float]:
    """
    Get the average popularity for each user in the DataFrame

    Args:
        data (pd.DataFrame): a pandas dataframe with columns = ['from_id', 'to_id', 'rating']
        pop_by_items (Dict<str, object>): popularity for each label ('label', 'popularity')
        group (Set<str>): (optional) the set of users (from_id); when omitted
            every user that appears in data is considered

    Returns:
        avg_pop_by_users (Dict<str, float>): average popularity by user
    """

    def show_progress(coll, milestones=10):
        # yields the elements of coll, logging every `milestones` elements
        for processed, element in enumerate(coll, start=1):
            yield element
            if processed % milestones == 0:
                logger.info('Processed %s user in the group', processed)

    if group is None:
        # One entry per user instead of one per rating row: unique() keeps the
        # order of first appearance, so the returned dict has the same keys in
        # the same order while each user is processed only once.
        group = data['from_id'].unique()

    logger.info("Group length: %d", len(group))

    avg_pop_by_users = {}
    for user in show_progress(group):
        user_items = data[data.from_id == user].to_id.values.flatten()
        avg_pop_by_users[user] = get_avg_pop(user_items, pop_by_items)

    return avg_pop_by_users
def remove_from_categories(alias: str):
    """
    Removes the category stored for the given alias and serializes the
    updated categories; only logs a message when the alias has no category

    Args:
        alias (str): alias whose category must be removed
    """
    cat = get_cat()

    if alias in cat.keys():
        cat.pop(alias)
        __serialize(cat, 'categories')
        logger.info('alias %s category successfully removed', alias)
        return

    logger.info('alias %s does not have a category', alias)
def remove(alias: str):
    """
    Removes the runnable instance stored for the given alias, together with
    its category, and serializes the updated runnable instances; only logs a
    message when the alias does not exist

    Args:
        alias (str): alias of the runnable instance to remove
    """
    r_i = get()

    if alias in r_i.keys():
        r_i.pop(alias)
        remove_from_categories(alias)
        __serialize(r_i, 'runnable_instances')
        logger.info('alias %s successfully removed', alias)
        return

    logger.info('alias %s does not exist, runnable_instance not removed', alias)
def show(categories: bool=False):
    """
    Logs every '< alias : value >' pair of the stored runnable instances or,
    when requested, of the stored categories

    Args:
        categories (bool): True to show the categories, False (default) to show
            the runnable instances
    """
    entries = get_cat() if categories else get()
    for alias in entries.keys():
        logger.info('< %s : %s >', alias, str(entries[alias]))
def create_content(self, raw_content: Dict):
    """
    Creates a content processing every field in the specified way.
    This method is iteratively invoked by the fit method.

    Steps, in order: the content id is built from the id fields, the exogenous
    properties are attached, the indexer (if any) and every interface are told
    a new content starts, each configured field is created and appended, and
    finally the indexer and the interfaces serialize the content.

    Args:
        raw_content (dict): Raw data from which the content will be created

    Returns:
        content (Content): an instance of content with his fields

    Raises:
        general Exception: when no config has been set
    """
    if self.__config is None:
        raise Exception("You must set a config with set_config()")

    CONTENT_ID = "content_id"

    timestamp = self.__get_timestamp(raw_content)

    # the id is obtained by merging the fields that compose it
    content_id = id_merger(raw_content, self.__config.id_field_name_list)
    content = Content(content_id)

    # exogenous representations are named after their position in the config
    for position, ex_retrieval in enumerate(self.__config.exogenous_properties_retrieval):
        representation_name = str(position)
        content.append_exogenous_rep(
            representation_name,
            ex_retrieval.get_properties(representation_name, raw_content))

    has_indexer = self.__indexer is not None
    if has_indexer:
        self.__indexer.new_content()
        self.__indexer.new_field(CONTENT_ID, content_id)

    interfaces = self.__config.get_interfaces()
    for interface in interfaces:
        interface.new_content()
        interface.new_field(CONTENT_ID, content_id)

    # produce the fields of the content
    for field_name in self.__config.get_field_name_list():
        logger.info("Processing field: %s", field_name)
        field = self.__create_field(raw_content, field_name, content_id, timestamp)
        content.append(field_name, field)

    if has_indexer:
        content.index_document_id = self.__indexer.serialize_content()

    for interface in interfaces:
        interface.serialize_content()

    return content
def extract_profile(self, user_id: str) -> Dict:
    """
    Builds the profile of a user from the full graph: every node adjacent to
    the user node becomes a key, the 'weight' of the edge between the user
    and that node becomes its value

    Args:
        user_id (str): id of the user node

    Returns:
        profile (dict): adjacent nodes as keys, edge weights as values
    """
    profile = {}
    for neighbour in self.__fullgraph.get_adj(user_id):
        weight = self.__fullgraph.get_edge_data(user_id, neighbour)['weight']
        profile[neighbour] = weight
        logger.info('unpack %s, %s', str(neighbour), str(weight))
    return profile
def extract_profile(self, user_id: str) -> Dict:
    """
    Builds the profile of a user from the full graph: every successor of the
    user node becomes a key, the 'weight' of the link between the user and
    that successor becomes its value

    Args:
        user_id (str): id of the user node

    Returns:
        profile (dict): successor nodes as keys, link weights as values
    """
    profile = {}
    for successor in self.__fullgraph.get_successors(user_id):
        weight = self.__fullgraph.get_link_data(user_id, successor)['weight']
        profile[successor] = weight
        logger.info('unpack %s, %s', str(successor), str(weight))
    return profile
def __serialize(r_i: Dict[str, object], label: str):
    """
    Pickles the given dictionary into an lzma compressed file named
    '<label>.xz'

    The file under current_path is used when it can already be opened for
    reading; otherwise the file is written under the relative 'contents'
    directory.

    Args:
        r_i (Dict[str, object]): dictionary to serialize
        label (str): name of the file, without extension
    """
    logger.info("Serializing runnable_instances in utils dir",)

    target_path = '{}/{}.xz'.format(current_path, label)
    try:
        # opened for reading only to find out whether the file exists there
        with lzma.open(target_path, "rb"):
            pass
    except FileNotFoundError:
        target_path = 'contents/{}.xz'.format(label)

    with lzma.open(target_path, 'wb') as out_file:
        pickle.dump(r_i, out_file)
def __create_field(self, raw_content: Dict, field_name: str, content_id: str, timestamp: str):
    """
    Create a new field for the specified content

    When the raw value of the field is a list, its first element is the field
    data and its second element replaces the given timestamp. One
    representation is then handled for every pipeline configured on the
    field, named after the position of the pipeline.

    Args:
        raw_content (Dict): Raw content for the new field
        field_name (str): Name of the new field
        content_id (str): Id of the content to which add the field
        timestamp (str): timestamp of the field, unless overridden by the raw value

    Returns:
        field (ContentField)
    """
    raw_value = raw_content[field_name]
    if isinstance(raw_value, list):
        # [data, timestamp]: the field carries its own timestamp
        field_data, timestamp = raw_value[0], raw_value[1]
    else:
        field_data = raw_value

    # serialize for explanation
    memory_interface = self.__config.get_memory_interface(field_name)
    if memory_interface is not None:
        memory_interface.new_field(field_name, field_data)

    # produce representations
    field = ContentField(field_name, timestamp)

    for position, pipeline in enumerate(self.__config.get_pipeline_list(field_name)):
        logger.info("processing representation %d", position)

        representation_name = str(position)
        technique = pipeline.content_technique

        if isinstance(technique, CollectionBasedTechnique):
            representation = self.__create_representation_CBT(
                representation_name, field_name, content_id, pipeline)
            field.append(representation_name, representation)
        elif isinstance(technique, SingleContentTechnique):
            representation = self.__create_representation(
                representation_name, field_data, pipeline)
            field.append(representation_name, representation)
        elif isinstance(technique, SearchIndexing):
            self.__invoke_indexing_technique(field_name, field_data, pipeline, content_id)
        elif technique is None:
            self.__decode_field_data(field, representation_name, field_data)

    return field
def __dataset_refactor(self):
    """
    Goes through every pipeline of every configured field and, for each
    pipeline whose content technique is a CollectionBasedTechnique, configures
    the technique with the field, the pipeline and the preprocessors, then
    asks it to refactor the dataset taken from the configured source
    """
    for field_name in self.__config.get_field_name_list():
        for pipeline in self.__config.get_pipeline_list(field_name):
            technique = pipeline.get_content_technique()

            # the other kinds of technique need no collection
            if not isinstance(technique, CollectionBasedTechnique):
                continue

            logger.info("Creating collection for technique: %s on field %s, "
                        "representation: %s", technique, field_name, pipeline)

            technique.set_field_need_refactor(field_name)
            technique.set_pipeline_need_refactor(str(pipeline))
            technique.set_processor_list(pipeline.get_preprocessor_list())
            technique.dataset_refactor(
                self.__config.get_source(), self.__config.get_id_field_name())
def serialize(self, output_directory: str):
    """
    Pickles this content instance into an lzma compressed file, so the file
    extension is .xz

    The file is named after the content id, stripped of every character that
    is neither a word character nor whitespace.

    Args:
        output_directory (str): Name of the directory in which serialize
    """
    logger.info("Serializing content %s in %s", self.__content_id, output_directory)

    safe_name = re.sub(r'[^\w\s]', '', self.__content_id) + '.xz'
    destination = os.path.join(output_directory, safe_name)

    with lzma.open(destination, 'wb') as out_file:
        pickle.dump(self, out_file)
def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the Delta - GAP (Group Average Popularity) metric

    Args:
        truth (pd.DataFrame): original rating frame used for recsys config
        predictions (pd.DataFrame): dataframe with recommendations for multiple users

    Returns:
        results (pd.DataFrame): one row per user group, with columns
            ('user_group', 'delta-gap')
    """
    pop_items = popular_items(score_frame=truth)
    user_groups = split_user_in_groups(score_frame=predictions,
                                       groups=self.user_groups,
                                       pop_items=pop_items)

    items = predictions[['to_id']].values.flatten()

    logger.info("Computing pop by items")
    pop_by_items = Counter(items)

    logger.info("Computing recs avg pop by users")
    recs_avg_pop_by_users = get_avg_pop_by_users(predictions, pop_by_items)

    recommended_users = set(truth[['from_id']].values.flatten())

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for group_name in user_groups:
        group_users = user_groups[group_name]

        logger.info("Computing avg pop by users profiles for delta gap")
        avg_pop_by_users_profiles = get_avg_pop_by_users(truth, pop_by_items, group_users)

        logger.info("Computing delta gap for group: %s", group_name)
        recs_gap = calculate_gap(group=group_users.intersection(recommended_users),
                                 avg_pop_by_users=recs_avg_pop_by_users)
        profile_gap = calculate_gap(group=group_users,
                                    avg_pop_by_users=avg_pop_by_users_profiles)
        group_delta_gap = calculate_delta_gap(recs_gap=recs_gap, profile_gap=profile_gap)

        rows.append({'user_group': group_name, 'delta-gap': group_delta_gap})

    return pd.DataFrame(rows, columns=['user_group', 'delta-gap'])
def fit(self):
    """
    This method performs the rating prediction evaluation by initializing internally
    a recommender system that produces recommendations for all the
    users in the directory specified in the configuration phase.
    The evaluation is performed by creating a training set,
    and a test set with its corresponding
    truth base. The rating prediction will be computed on every item in the test set.

    Returns:
        prediction_metric_results: has a 'from' column, representing the user_ids for
            which the metrics was computed, and then one different column for every metric
            performed. The returned DataFrames contain one row per user, and the corresponding
            metric values are given by the mean of the values obtained for that user.
            When no partition could be evaluated the frame has those columns and no rows.

    Raises:
        ValueError: when no score prediction algorithm is set in the configuration
    """
    # initialize recommender to call for prediction computing
    recsys = RecSys(self.config)

    # get all users in specified directory
    logger.info("Loading user instances")
    user_id_list = [
        os.path.splitext(filename)[0]
        for filename in os.listdir(self.config.users_directory)]

    # calculate metrics on prediction algorithm results
    if self.config.score_prediction_algorithm is None:
        raise ValueError("You must set score prediction algorithm to compute this eval model")

    # One dict per evaluated partition. The rows are collected here and turned
    # into a frame once at the end: DataFrame.append returns a new frame (its
    # result must not be discarded) and was removed in pandas 2.0.
    result_rows = []

    for user_id in user_id_list:
        logger.info("User %s", user_id)

        logger.info("Loading user ratings")
        user_ratings = self.config.rating_frame[
            self.config.rating_frame['from_id'] == user_id]
        user_ratings = user_ratings.sort_values(['to_id'], ascending=True)

        try:
            self.partitioning.dataframe = user_ratings
        except ValueError:
            # presumably the ratings of this user cannot be partitioned
            # (confirm against the partitioning implementation): skip the user
            logger.info("Skipping user %s: ratings rejected by the partitioning", user_id)
            continue

        for partition_index in self.partitioning:
            logger.info("Computing prediction metrics")
            train = user_ratings.iloc[partition_index[0]]
            test = user_ratings.iloc[partition_index[1]]
            test = remove_not_existent_items(test, self.config.items_directory)

            predictions = recsys.fit_eval_predict(user_id, train, test)

            # 'from' identifies the user so that rows can be averaged per user
            result_dict = {'from': user_id}
            for metric in self.metrics:
                result_dict[str(metric)] = metric.perform(predictions, test)

            result_rows.append(result_dict)

    if not result_rows:
        return pd.DataFrame(columns=['from'] + [str(metric) for metric in self.metrics])

    prediction_metric_results = pd.DataFrame(result_rows)
    return prediction_metric_results.groupby('from').mean().reset_index()
def get_properties(self, raw_source: RawInformationSource) -> List[EntitiesProp]:
    """
    Produces a list of EntitiesProp objects for every raw content in the raw source.

    An EntitiesProp object is basically a dict where the keys are the entity linked (since there can be
    multiple entities in a field) and values are properties retrieved from BabelPy for that entity.
    Properties that BabelFy does not return for an entity are set to the empty string.
    EXAMPLE:
        properties_list = [EntityProp(), EntityProp(), ...]

        EntityProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},'Nolan': {'babelSynsetID: ..., ...}, ...}

    Args:
        raw_source (RawInformationSource): source whose contents hold the field to link

    Returns:
        List[EntitiesProp]: one element per raw content

    Raises:
        AttributeError: when reading the linked entities fails, which is reported
            as the BabelFy request limit being reached
    """
    property_names = ('babelSynsetID', 'DBPediaURL', 'BabelNetURL', 'score',
                      'coherenceScore', 'globalScore', 'source')

    properties_list = []
    logger.info("Doing Entity Linking with BabelFy")

    # the source is materialized once only to tell the progress bar its length
    for raw_content in progbar(raw_source, max_value=len(list(raw_source))):
        data_to_disambiguate = check_not_tokenized(
            raw_content[self.__field_to_link])

        self.__babel_client.babelfy(data_to_disambiguate)

        properties_content = {}
        try:
            merged_entities = self.__babel_client.merged_entities
            if merged_entities is not None:
                for entity in merged_entities:
                    properties_content[entity['text']] = {
                        key: entity[key] if entity.get(key) is not None else ''
                        for key in property_names
                    }

            properties_list.append(EntitiesProp(properties_content))
        except AttributeError as err:
            # keep the original error as the cause so the real failure stays visible
            raise AttributeError(
                "BabelFy limit reached! Insert an api key or change it if you inserted one!"
            ) from err

    return properties_list
def show_progress(coll, milestones=100):
    """
    Yields the elements contained in coll and logs how many have been processed

    Args:
        coll (list): List that contains the ratings to process
        milestones (int): Tells to the method how often it has to log an update.
            For example, if milestones = 100, an update is logged for every 100
            items processed
    """
    for processed, element in enumerate(coll, start=1):
        yield element
        # reached only once the consumer asks for the next element
        if processed % milestones == 0:
            logger.info('Processed %s elements', processed)