Ejemplo n.º 1
0
    def test_predict(self):
        """
        Smoke test: run NXPageRank.predict for user 'A000' on a small rating frame
        and log every entry of the returned ranking. No assertion is made on the result.
        """
        columns = ["from_id", "to_id", "score", "timestamp"]
        records = [
            ("A000", "tt0114576", 0.5, "54654675"),
            ("A000", "tt0112453", -0.5, "54654675"),
            ("A001", "tt0114576", 0.8, "54654675"),
            ("A001", "tt0112896", -0.4, "54654675"),
            ("A000", "tt0113041", 0.6, "54654675"),
            ("A002", "tt0112453", -0.2, "54654675"),
            ("A002", "tt0113497", 0.5, "54654675"),
            ("A003", "tt0112453", -0.8, "54654675"),
        ]
        ratings = pd.DataFrame.from_records(records, columns=columns)

        # The contents directory is reachable through one of two relative paths,
        # depending on the working directory; probe the first and fall back to the second.
        contents_dir = "../../../contents/movielens_test1591885241.5520566"
        try:
            lzma.open(os.path.join(contents_dir, "tt0114576.xz"), "r").close()
        except FileNotFoundError:
            contents_dir = "contents/movielens_test1591885241.5520566"

        alg = NXPageRank()
        rank = alg.predict('A000', ratings, 1, contents_dir, ['tt0114576'])

        logger.info('pg_rk results')
        for item_id in rank.keys():
            logger.info('%s %s', str(item_id), str(rank[item_id]))
    def extract_corpus(self, source: RawInformationSource,
                       field_list: List[str],
                       preprocessor_list: List[InformationProcessor]) -> list:
        """
        Builds the corpus from the raw source: for every document the selected fields are
        lower-cased and concatenated (each one preceded by a space), the resulting text is passed
        through every preprocessor in order, and the outcome is handed to process_data_granularity.

        Args:
            source (RawInformationSource): raw data to iterate over, one document at a time
            field_list (List[str]): names of the fields to read from every document
            preprocessor_list (List[InformationProcessor]): processors applied, in order,
                to the concatenated text of every document

        Returns:
            corpus (list): one processed entry per document, in source order
        """
        corpus = []
        for doc_index, document in enumerate(source):
            logger.info("Document %d", doc_index)

            # every field contributes " <lower-cased text>", so the text starts with a space
            text = "".join(" " + document[field_name].lower()
                           for field_name in field_list)

            for preprocessor in preprocessor_list:
                text = preprocessor.process(text)

            corpus.append(self.process_data_granularity(text))
        return corpus
Ejemplo n.º 3
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
        """
        Compute the correlation between the two ranks

        Items are matched through their 'to_id' label. For every label found in both frames,
        its position in the truth frame and its position in the predictions frame are collected;
        the configured correlation is then computed between the two lists of positions.

        Args:
            predictions (pd.DataFrame): frame with a 'to_id' column, rows in ranking order
            truth (pd.DataFrame): frame with a 'to_id' column, rows in ranking order

        Returns:
            (float): value of the specified correlation metric. 0.0 is returned when fewer than
                two matching positions are found; 0 is returned when the configured method is
                not one of 'pearson', 'kendall', 'spearman'
        """
        logger.info("Computing correlation")

        truth_labels = truth['to_id'].values
        prediction_labels = predictions['to_id'].values

        # Positions are accumulated in plain lists: Series.iteritems() and Series.append()
        # were removed in pandas 2.0, and append() copied the whole series on every match.
        t_positions = []
        p_positions = []
        for t_index, t_value in enumerate(truth_labels):
            for p_index, p_value in enumerate(prediction_labels):
                if t_value == p_value:
                    t_positions.append(t_index)
                    p_positions.append(p_index)

        # A correlation needs at least two points
        if len(t_positions) < 2:
            return 0.0

        coef = 0
        if self.__method == 'pearson':
            coef, _ = pearsonr(t_positions, p_positions)
        elif self.__method == 'kendall':
            coef, _ = kendalltau(t_positions, p_positions)
        elif self.__method == 'spearman':
            coef, _ = spearmanr(t_positions, p_positions)

        return coef
Ejemplo n.º 4
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
        """
        Compute the MAE (Mean Absolute Error) metric

        The 'rating' columns of the two frames are compared position by position. When the
        frames have a different number of rows, the longer one is cut to the length of the
        shorter one before comparing.

        Args:
              truth (pd.DataFrame): dataframe whose columns are: to_id, rating
              predictions (pd.DataFrame): dataframe whose columns are: to_id, rating

        Returns:
            (float): mean of the absolute differences between predicted and true ratings
        """
        logger.info("Computing MAE")

        predicted = pd.Series(predictions['rating'].values, name="rating", dtype=float)
        actual = pd.Series(truth['rating'].values, name="rating", dtype=float)

        # keep only the leading rows that both series have
        common_len = min(len(predicted), len(actual))
        predicted = predicted[0:common_len]
        actual = actual[0:common_len]

        return np.mean((predicted - actual).abs())
Ejemplo n.º 5
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
        """
        Compute the RMSE (Root Mean Squared Error) metric

        The 'rating' columns of the two frames are compared position by position. When the
        frames have a different number of rows, the longer one is cut to the length of the
        shorter one before comparing.

        Args:
              truth (pd.DataFrame): dataframe whose columns are: to_id, rating
              predictions (pd.DataFrame): dataframe whose columns are: to_id, rating

        Returns:
            (float): square root of the mean of the squared rating differences
        """
        logger.info("Computing RMSE")

        predicted = pd.Series(predictions['rating'].values, name="rating", dtype=float)
        actual = pd.Series(truth['rating'].values, name="rating", dtype=float)

        # keep only the leading rows that both series have
        common_len = min(len(predicted), len(actual))
        predicted = predicted[0:common_len]
        actual = actual[0:common_len]

        squared_errors = np.square(predicted - actual)
        return np.sqrt(np.mean(squared_errors))
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> float:
        """
        Compute the Mean Reciprocal Rank metric

        The labels of the two frames are obtained from the parent class through _get_labels.
        For every label found in both series, the ratio
        (position in truth + 1) / (position in predictions + 1) is accumulated, and the total
        is divided by the number of truth labels.

        NOTE(review): this is the value the code computes; it is not the textbook
        1/|Q| * sum(1 / rank(i)) formulation. Confirm which one is intended.

        Args:
              truth (pd.DataFrame): dataframe whose columns are: to_id, rating
              predictions (pd.DataFrame): dataframe whose columns are: to_id, rating;
                  it represents the ranking of all the items in the test set,
                  first n will be considered relevant,
                  with n equal to the number of relevant items in the test set

        Returns:
            (float): the mrr value, 0 when there are no truth labels
        """
        logger.info("Computing MRR")

        prediction_labels, truth_labels = super()._get_labels(predictions, truth)

        if len(truth_labels) == 0:
            return 0

        # Series.iteritems() was removed in pandas 2.0; items() is the equivalent accessor.
        # The prediction pairs do not change, so they are built once and not per truth label.
        prediction_pairs = list(prediction_labels.items())

        mrr = 0
        for t_index, t_value in truth_labels.items():
            for p_index, p_value in prediction_pairs:
                if t_value == p_value:
                    mrr += (int(t_index) + 1) / (int(p_index) + 1)
        return mrr / len(truth_labels)
Ejemplo n.º 7
0
    def extract_profile(user_id: str, graph: FullGraph) -> Dict:
        """
        Builds the profile of a user from the graph: every successor of the user node becomes
        a key of the profile, and the 'weight' stored on the link from the user to that
        successor becomes its value.

        Example: if the successors of user 'A' are 'I1' and 'I2', and the links A -> I1 and
        A -> I2 have weight 0.2 and 0.4 respectively, the result is {'I1': 0.2, 'I2': 0.4}

        Args:
            user_id (str): id of the user node whose profile is extracted
            graph (FullGraph): graph to read successors and link data from. It is received
                as an argument so that the caller decides which graph is inspected

        Returns:
            profile (dict): successor nodes of the user as keys, weight of the link connecting
                the user to each of them as values
        """
        profile = {}
        for item_node in graph.get_successors(user_id):
            weight = graph.get_link_data(user_id, item_node)['weight']
            profile[item_node] = weight
            logger.info('unpack %s, %s', str(item_node), str(weight))
        return profile
Ejemplo n.º 8
0
    def test_predict(self):
        """
        Checks NXPageRank.predict on the ratings of user 'A001' in three configurations:
        no graph, with the graph, and with the graph in personalized mode.
        """
        def dump(ranking):
            # log the header and print one "<item> <score>" line per ranked item
            logger.info('pg_rk results')
            for item_id in ranking.keys():
                print(str(item_id) + " " + str(ranking[item_id]))

        user_ratings = ratings[ratings['from_id'] == 'A001']

        # an instance built without a graph is expected to return an empty ranking
        graphless = NXPageRank()
        self.assertEqual(graphless.predict(user_ratings, 2), {})

        plain = NXPageRank(graph=graph)
        rank = plain.predict(user_ratings, 1)
        dump(rank)
        self.assertIn('tt0112453', rank.keys())

        personalized = NXPageRank(graph=graph, personalized=True)
        rank_personalized = personalized.predict(user_ratings, 1)
        dump(rank_personalized)
        self.assertIn('tt0113041', rank_personalized)
    def get_properties(
            self, raw_source: RawInformationSource) -> List[PropertiesDict]:
        """
        Builds one PropertiesDict for every raw content of the source.

        When no field name list is configured the raw content itself is used; otherwise only
        the configured fields whose value is not None are kept. In 'only_retrieved_evaluated'
        mode, fields whose value is the empty string are discarded as well.

        Args:
            raw_source (RawInformationSource): source to iterate over

        Returns:
            List[PropertiesDict]: one entry per raw content, in source order
        """
        logger.info("Extracting exogenous properties from local dataset")

        properties = []
        for raw_content in raw_source:
            wanted_fields = self.__field_name_list

            if wanted_fields is None:
                selected = raw_content
            else:
                selected = {name: raw_content[name]
                            for name in wanted_fields
                            if raw_content.get(name) is not None}

            if self.mode == 'only_retrieved_evaluated':
                selected = {name: value
                            for name, value in selected.items()
                            if value != ''}

            properties.append(PropertiesDict(selected))

        return properties
Ejemplo n.º 10
0
    def get_properties(self, name: str, raw_content: Dict[str, object]) -> PropertiesDict:
        """
        Retrieve the properties of a raw content according to the configured mode

        The mode selects which private retrieval routine is run on the raw content:
        'only_retrieved_evaluated', 'all_retrieved', 'original_retrieved' or 'all'.
        Any other mode yields an empty set of properties.

        Args:
            name (str): string identifier of the returned properties object
            raw_content: represent a row in the dataset that
                is being processed

        Returns:
            PropertiesDict: the retrieved properties, wrapped together with the given name
        """
        logger.info("Extracting exogenous properties")

        if self.mode == 'only_retrieved_evaluated':
            prop_dict = self.__get_only_retrieved_evaluated(raw_content)
        elif self.mode == 'all_retrieved':
            prop_dict = self.__get_all_properties_retrieved(raw_content)
        elif self.mode == 'original_retrieved':
            prop_dict = self.__get_original_retrieved(raw_content)
        elif self.mode == 'all':
            prop_dict = self.__get_all_properties(raw_content)
        else:
            prop_dict = {}

        # Diagnostic output goes through the logger instead of being printed on stdout
        logger.debug("Retrieved properties: %s", prop_dict)

        return PropertiesDict(name, prop_dict)
Ejemplo n.º 11
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame):
        """
        Plots, for every item of the truth frame, its popularity (number of occurrences in
        truth) against the number of times it appears in the predictions, and saves the
        scatter plot as an svg file in the output directory.

        When at least one item was never recommended, a second plot limited to the items
        with at least one recommendation is saved with the '-no-zeros' suffix.

        Args:
              truth (pd.DataFrame): original rating frame used for recsys config
              predictions (pd.DataFrame): dataframe with recommendations for multiple users
        """

        logger.info("Computing pop recs correlation")

        def save_scatter(x_values, y_values, title, directory):
            # draw the scatter, write it to disk and reset the current figure
            plt.scatter(x_values,
                        y_values,
                        marker='o',
                        s=20,
                        c='orange',
                        edgecolors='black',
                        linewidths=0.05)
            plt.title('{}'.format(title))
            plt.xlabel('Popularity')
            plt.ylabel('Recommendation frequency')
            plt.savefig('{}/pop-recs_{}.svg'.format(directory, title))
            plt.clf()

        # items of the truth frame, most popular first
        popularity_ranking = Counter(truth[['to_id']].values.flatten()).most_common()
        recs_by_item = Counter(predictions[['to_id']].values.flatten())

        # (popularity, number of recommendations) for every item; a Counter yields 0
        # for items that were never recommended
        pairs = [(pop, recs_by_item[item]) for item, pop in popularity_ranking]
        recommended_pairs = [pair for pair in pairs if pair[1] != 0]

        save_scatter([pop for pop, _ in pairs],
                     [recs for _, recs in pairs],
                     self.get_file_name(),
                     self.get_output_directory())

        if len(recommended_pairs) != len(pairs):
            save_scatter([pop for pop, _ in recommended_pairs],
                         [recs for _, recs in recommended_pairs],
                         self.get_file_name() + '-no-zeros',
                         self.get_output_directory())
Ejemplo n.º 12
0
def get_unrated_items(items_directory: str, ratings) -> List[Content]:
    """
    Gets the items that a user has not rated

    The candidate items are the files of the items directory (extension removed,
    'search_index' excluded). The rated ids are stripped of every character that is neither
    a word character nor whitespace before being compared with the file names.

    Args:
        items_directory (str): Path to the items directory
        ratings (pd.DataFrame): Ratings of a user

    Returns:
        unrated_items (List<Content>): List of items that the user has not rated
    """
    directory_item_ids = [os.path.splitext(filename)[0]
                          for filename in os.listdir(items_directory)
                          if filename != 'search_index']

    # NOTE(review): presumably this mirrors how item files are named on disk - confirm
    rated_item_ids = {re.sub(r'[^\w\s]', '', item_id) for item_id in ratings.to_id}

    # Every id kept here already comes from the directory listing, so no further
    # intersection with the directory content is needed.
    unrated_item_ids = [item_id for item_id in directory_item_ids
                        if item_id not in rated_item_ids]

    logger.info("Loading unrated items")
    unrated_items = [
        load_content_instance(items_directory, item_id)
        for item_id in progbar(unrated_item_ids, prefix="Loading unrated items:")]

    return unrated_items
Ejemplo n.º 13
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame):
        """
        Plots the long tail distribution of the truth frame: the number of occurrences of
        every 'to_id', from the most to the least frequent, saved as an svg file in the
        output directory.

        Args:
              truth (pd.DataFrame): original rating frame used for recsys config
              predictions (pd.DataFrame): dataframe with recommendations for multiple users
                  (not read by this method)
        """
        logger.info("Computing recs long tail distr")

        item_counts = Counter(truth[['to_id']].values.flatten())
        # most_common() yields (item, count) pairs sorted by decreasing count
        ordered_counts = [count for _, count in item_counts.most_common()]

        plt.plot(ordered_counts)
        plt.title('{}'.format(self.file_name))
        plt.ylabel('Num of recommendations')
        plt.xlabel('Recommended items')

        target = '{}/recs-long-tail-distr_{}.svg'.format(self.output_directory,
                                                         self.file_name)
        plt.savefig(target)

        plt.clf()
Ejemplo n.º 14
0
 def show_progress(coll, milestones=10):
     """
     Yields the elements of coll one by one, logging the running count every time it
     reaches a multiple of milestones.
     """
     for count, element in enumerate(coll, start=1):
         yield element
         # the count is checked only after the consumer asks for the next element
         if count % milestones == 0:
             logger.info('Processed %s user in the group', count)
Ejemplo n.º 15
0
    def get_properties(self, name: str,
                       raw_content: Dict[str, object]) -> PropertiesDict:
        """
        Builds the properties of a raw content as a dictionary of strings.

        When no field name list is configured, every key of the raw content is used.
        Otherwise the configured names are used in order, but never more of them than the
        number of keys in the raw content. A name missing from the raw content gets the
        empty string as value. In 'only_retrieved_evaluated' mode, names whose value is the
        empty string are left out.

        Args:
            name (str): string identifier of the returned properties object
            raw_content (Dict[str, object]): row of the dataset being processed

        Returns:
            PropertiesDict: the collected properties, wrapped together with the given name
        """

        logger.info("Extracting exogenous properties")

        if self.__field_name_list is None:
            field_names = list(raw_content.keys())
        else:
            # zip stops at the shorter of the two, which caps the configured names
            # at the number of keys found in the raw content
            field_names = [
                configured
                for _, configured in zip(raw_content.keys(), self.__field_name_list)
            ]

        prop_dict = {}
        for field_name in field_names:
            if field_name in raw_content.keys():
                value = str(raw_content[field_name])
            else:
                value = ''

            if value == '' and self.mode == 'only_retrieved_evaluated':
                continue
            prop_dict[field_name] = value

        return PropertiesDict(name, prop_dict)
Ejemplo n.º 16
0
def get_avg_pop_by_users(data: pd.DataFrame,
                         pop_by_items: Dict[str, object],
                         group: Set[str] = None) -> Dict[str, float]:
    """
    Get the average popularity for each user in the DataFrame

    Args:
        data (pd.DataFrame): a pandas dataframe with columns = ['from_id', 'to_id', 'rating']
        pop_by_items (Dict<str, object>): popularity for each label ('label', 'popularity')
        group (Set<str>): (optional) the set of users (from_id); when omitted, every
            distinct 'from_id' found in data is used

    Returns:
        avg_pop_by_users (Dict<str, float>): average popularity by user
    """
    def show_progress(coll, milestones=10):
        # yields the elements of coll, logging the running count at every milestone
        processed = 0
        for element in coll:
            yield element
            processed += 1
            if processed % milestones == 0:
                logger.info('Processed %s user in the group', processed)

    if group is None:
        # The 'from_id' column holds one entry per rating, so a user appears once for
        # every rating given. Repeated ids are dropped (keeping first-appearance order)
        # so that each user is processed once; the resulting dictionary is unchanged.
        group = list(dict.fromkeys(data[['from_id']].values.flatten()))
    logger.info("Group length: %d", len(group))

    # items rated by each user of the group
    series_by_user = {
        user: data[data.from_id == user].to_id.values.flatten()
        for user in show_progress(group)
    }
    avg_pop_by_users = {
        user: get_avg_pop(series_by_user[user], pop_by_items)
        for user in show_progress(group)
    }

    return avg_pop_by_users
Ejemplo n.º 17
0
def remove_from_categories(alias: str):
    """
    Removes the category stored for the given alias and serializes the updated categories.
    When the alias has no category, only a message is logged.

    Args:
        alias (str): alias whose category is removed
    """
    categories = get_cat()
    if alias in categories.keys():
        categories.pop(alias)
        __serialize(categories, 'categories')
        logger.info('alias %s category successfully removed', alias)
        return

    logger.info('alias %s does not have a category', alias)
Ejemplo n.º 18
0
def remove(alias: str):
    """
    Removes the runnable instance stored for the given alias, together with its category,
    and serializes the updated runnable instances. When the alias is unknown, only a
    message is logged.

    Args:
        alias (str): alias of the runnable instance to remove
    """
    instances = get()
    if alias not in instances.keys():
        logger.info('alias %s does not exist, runnable_instance not removed', alias)
        return

    instances.pop(alias)
    remove_from_categories(alias)
    __serialize(instances, 'runnable_instances')
    logger.info('alias %s successfully removed', alias)
Ejemplo n.º 19
0
def show(categories: bool=False):
    """
    Logs every '< alias : value >' pair of the stored categories or of the stored
    runnable instances.

    Args:
        categories (bool): when True the categories are shown, otherwise the
            runnable instances
    """
    entries = get_cat() if categories else get()
    for alias, value in entries.items():
        logger.info('< %s : %s >', alias, str(value))
Ejemplo n.º 20
0
    def create_content(self, raw_content: Dict):
        """
        Creates a content by processing every configured field of the raw content.
        This method is iteratively invoked by the fit method.

        The steps are, in order: build the content id, attach the exogenous properties,
        open a new content on the indexer (when one is set) and on every interface,
        create and append every configured field, and finally serialize the content on the
        indexer and on every interface.

        Args:
            raw_content (dict): Raw data from which the content will be created

        Returns:
            content (Content): an instance of content with its fields

        Raises:
            Exception: when no config has been set
        """

        if self.__config is None:
            raise Exception("You must set a config with set_config()")

        # name of the field under which the content id is stored on indexer and interfaces
        CONTENT_ID = "content_id"

        timestamp = self.__get_timestamp(raw_content)

        # build the id from the list of fields that compose it
        content_id = id_merger(raw_content, self.__config.id_field_name_list)
        content = Content(content_id)

        # one exogenous representation per configured retrieval technique,
        # identified by the position of the technique in the configuration
        for i, ex_retrieval in enumerate(
                self.__config.exogenous_properties_retrieval):
            lod_properties = ex_retrieval.get_properties(str(i), raw_content)
            content.append_exogenous_rep(str(i), lod_properties)

        # open the new content on the indexer before any field is created
        if self.__indexer is not None:
            self.__indexer.new_content()
            self.__indexer.new_field(CONTENT_ID, content_id)

        # same for every interface of the configuration
        interfaces = self.__config.get_interfaces()
        for interface in interfaces:
            interface.new_content()
            interface.new_field(CONTENT_ID, content_id)

        # create and append every configured field
        for field_name in self.__config.get_field_name_list():
            logger.info("Processing field: %s", field_name)
            # the content-level timestamp is passed down to the field creation
            content.append(
                field_name,
                self.__create_field(raw_content, field_name, content_id,
                                    timestamp))

        # close the content on the indexer, keeping track of the value it returns
        if self.__indexer is not None:
            content.index_document_id = self.__indexer.serialize_content()

        for interface in interfaces:
            interface.serialize_content()

        return content
Ejemplo n.º 21
0
 def extract_profile(self, user_id: str) -> Dict:
     """
     Builds the profile of a user from the full graph: every adjacent node of the user
     becomes a key, and the 'weight' of the edge from the user to that node its value.

     Args:
         user_id (str): id of the user node

     Returns:
         profile (dict): adjacent nodes as keys, edge weights as values
     """
     profile = {}
     for neighbour in self.__fullgraph.get_adj(user_id):
         weight = self.__fullgraph.get_edge_data(user_id, neighbour)['weight']
         profile[neighbour] = weight
         logger.info('unpack %s, %s', str(neighbour), str(weight))
     return profile
Ejemplo n.º 22
0
 def extract_profile(self, user_id: str) -> Dict:
     """
     Builds the profile of a user from the full graph: every successor of the user node
     becomes a key, and the 'weight' of the link from the user to that node its value.

     Args:
         user_id (str): id of the user node

     Returns:
         profile (dict): successor nodes as keys, link weights as values
     """
     profile = {}
     for successor in self.__fullgraph.get_successors(user_id):
         weight = self.__fullgraph.get_link_data(user_id, successor)['weight']
         profile[successor] = weight
         logger.info('unpack %s, %s', str(successor), str(weight))
     return profile
Ejemplo n.º 23
0
def __serialize(r_i: Dict[str, object], label: str):
    """
    Pickles the given dictionary into an lzma-compressed '<label>.xz' file.

    The file under current_path is used when it can already be opened; otherwise the
    dictionary is written to 'contents/<label>.xz'.

    Args:
        r_i (Dict[str, object]): dictionary to serialize
        label (str): name of the file, without extension
    """
    logger.info("Serializing runnable_instances in utils dir")

    target = '{}/{}.xz'.format(current_path, label)
    try:
        # probe only: nothing is read, the file is opened to check that it exists
        lzma.open(target, "rb").close()
    except FileNotFoundError:
        target = 'contents/{}.xz'.format(label)

    with lzma.open(target, 'wb') as archive:
        pickle.dump(r_i, archive)
Ejemplo n.º 24
0
    def __create_field(self, raw_content: Dict, field_name: str,
                       content_id: str, timestamp: str):
        """
        Creates the field of a content, producing one representation for every pipeline
        configured for that field.

        When the raw value of the field is a list, its first element is taken as the field
        data and its second element replaces the timestamp received as argument.

        Args:
            raw_content (Dict): Raw content the field data is read from
            field_name (str): Name of the field to create
            content_id (str): Id of the content the field belongs to
            timestamp (str): timestamp of the content, used unless the field carries its own

        Returns:
            field (ContentField): the field with the produced representations
        """
        raw_value = raw_content[field_name]
        if isinstance(raw_value, list):
            # [data, timestamp] pair: the field overrides the content timestamp
            field_data, timestamp = raw_value[0], raw_value[1]
        else:
            field_data = raw_value

        # serialize for explanation
        memory_interface = self.__config.get_memory_interface(field_name)
        if memory_interface is not None:
            memory_interface.new_field(field_name, field_data)

        field = ContentField(field_name, timestamp)

        # the position of the pipeline in the configuration identifies the representation
        pipelines = list(self.__config.get_pipeline_list(field_name))
        for position, pipeline in enumerate(pipelines):
            logger.info("processing representation %d", position)
            representation_id = str(position)
            technique = pipeline.content_technique

            if isinstance(technique, CollectionBasedTechnique):
                representation = self.__create_representation_CBT(
                    representation_id, field_name, content_id, pipeline)
                field.append(representation_id, representation)
            elif isinstance(technique, SingleContentTechnique):
                representation = self.__create_representation(
                    representation_id, field_data, pipeline)
                field.append(representation_id, representation)
            elif isinstance(technique, SearchIndexing):
                self.__invoke_indexing_technique(field_name, field_data,
                                                 pipeline, content_id)
            elif technique is None:
                self.__decode_field_data(field, representation_id, field_data)

        return field
Ejemplo n.º 25
0
    def __dataset_refactor(self):
        """
        Prepares every collection based technique found in the configuration: for each
        pipeline of each field whose technique is a CollectionBasedTechnique, the technique
        is told which field, pipeline and preprocessors it works on, and its
        dataset_refactor is run on the configured source.
        """
        config = self.__config
        for field_name in config.get_field_name_list():
            for pipeline in config.get_pipeline_list(field_name):
                technique = pipeline.get_content_technique()

                # only collection based techniques need this preparation step
                if not isinstance(technique, CollectionBasedTechnique):
                    continue

                logger.info("Creating collection for technique: %s on field %s, "
                            "representation: %s", technique, field_name, pipeline)
                technique.set_field_need_refactor(field_name)
                technique.set_pipeline_need_refactor(str(pipeline))
                technique.set_processor_list(pipeline.get_preprocessor_list())
                technique.dataset_refactor(config.get_source(),
                                           config.get_id_field_name())
Ejemplo n.º 26
0
    def serialize(self, output_directory: str):
        """
        Pickles this content instance into an lzma-compressed file (extension .xz).

        The file name is the content id with every character that is neither a word
        character nor whitespace removed.

        Args:
            output_directory (str): Name of the directory the file is written to
        """
        logger.info("Serializing content %s in %s", self.__content_id, output_directory)

        safe_name = re.sub(r'[^\w\s]', '', self.__content_id)
        target = os.path.join(output_directory, safe_name + '.xz')

        with lzma.open(target, 'wb') as archive:
            pickle.dump(self, archive)
Ejemplo n.º 27
0
    def perform(self, predictions: pd.DataFrame, truth: pd.DataFrame) -> pd.DataFrame:
        """
        Compute the Delta - GAP (Group Average Popularity) metric

        Users are split in groups; for every group the GAP of the recommendations and the
        GAP of the user profiles are computed, and their delta is reported.

        Args:
              truth (pd.DataFrame): original rating frame used for recsys config
              predictions (pd.DataFrame): dataframe with recommendations for multiple users

        Returns:
            results (pd.DataFrame): each row contains ('user_group', 'delta-gap')
        """

        pop_items = popular_items(score_frame=truth)
        user_groups = split_user_in_groups(score_frame=predictions, groups=self.user_groups, pop_items=pop_items)

        logger.info("Computing pop by items")
        pop_by_items = Counter(predictions[['to_id']].values.flatten())

        logger.info("Computing recs avg pop by users")
        recs_avg_pop_by_users = get_avg_pop_by_users(predictions, pop_by_items)

        recommended_users = set(truth[['from_id']].values.flatten())

        # Rows are collected in a list and turned into a frame once at the end:
        # DataFrame.append() was removed in pandas 2.0 and copied the frame on every call.
        rows = []
        for group_name in user_groups:
            group_users = user_groups[group_name]

            logger.info("Computing avg pop by users profiles for delta gap")
            avg_pop_by_users_profiles = get_avg_pop_by_users(truth, pop_by_items, group_users)

            logger.info("Computing delta gap for group: %s", group_name)
            recs_gap = calculate_gap(group=group_users.intersection(recommended_users),
                                     avg_pop_by_users=recs_avg_pop_by_users)
            profile_gap = calculate_gap(group=group_users, avg_pop_by_users=avg_pop_by_users_profiles)
            group_delta_gap = calculate_delta_gap(recs_gap=recs_gap, profile_gap=profile_gap)

            rows.append({'user_group': group_name, 'delta-gap': group_delta_gap})

        # explicit columns keep the frame well-formed even when there are no groups
        return pd.DataFrame(rows, columns=['user_group', 'delta-gap'])
Ejemplo n.º 28
0
    def fit(self):
        """
        This method performs the rating prediction evaluation by initializing internally
            a recommender system that produces recommendations for all the
            users in the directory specified in the configuration phase.
            The evaluation is performed by creating a training set,
            and a test set with its corresponding
            truth base. The rating prediction will be computed on every item in the test set.

        Returns:
            prediction_metric_results: has a 'from' column, representing the user_ids for
                which the metrics was computed, and then one different column for every metric
                performed. The returned DataFrames contain one row per user, and the corresponding
                metric values are given by the mean of the values obtained for that user.
                When no partition could be evaluated, an empty frame with only the 'from'
                column is returned.

        Raises:
            ValueError: when no score prediction algorithm is set in the configuration
        """
        # initialize recommender to call for prediction computing
        recsys = RecSys(self.config)

        # get all users in specified directory
        logger.info("Loading user instances")
        user_id_list = [
            os.path.splitext(filename)[0]
            for filename in os.listdir(self.config.users_directory)]

        # calculate metrics on prediction algorithm results
        if self.config.score_prediction_algorithm is None:
            raise ValueError("You must set score prediction algorithm to compute this eval model")

        # One row per (user, partition). Rows are kept in a list because DataFrame.append()
        # returns a new frame (its result was previously discarded, so nothing was ever
        # stored) and no longer exists in pandas 2.0.
        rows = []

        for user_id in user_id_list:
            logger.info("User %s", user_id)
            logger.info("Loading user ratings")

            user_ratings = self.config.rating_frame[
                self.config.rating_frame['from_id'] == user_id]
            user_ratings = user_ratings.sort_values(['to_id'], ascending=True)

            # users whose ratings are rejected by the partitioning are skipped
            try:
                self.partitioning.dataframe = user_ratings
            except ValueError:
                continue

            for partition_index in self.partitioning:
                logger.info("Computing prediction metrics")
                train = user_ratings.iloc[partition_index[0]]
                test = user_ratings.iloc[partition_index[1]]
                test = remove_not_existent_items(test, self.config.items_directory)

                predictions = recsys.fit_eval_predict(user_id, train, test)

                # the user id is stored with the metric values so that the rows can be
                # grouped by user below
                result_dict = {'from': user_id}
                for metric in self.metrics:
                    result_dict[str(metric)] = metric.perform(predictions, test)

                rows.append(result_dict)

        if not rows:
            return pd.DataFrame(columns=['from'])

        prediction_metric_results = pd.DataFrame(rows)
        prediction_metric_results = prediction_metric_results.groupby('from').mean().reset_index()

        return prediction_metric_results
    def get_properties(self,
                       raw_source: RawInformationSource) -> List[EntitiesProp]:
        """
        Produces one EntitiesProp object for every raw content of the raw source.

        The field to link of every raw content is sent to the BabelFy client; every merged
        entity returned is then described by a fixed set of properties, where a property the
        entity does not provide is left as an empty string. The text of the entity is the key
        under which its properties are stored.
        EXAMPLE:
            properties_list = [EntityProp(), EntityProp(), ...]

            EntityProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},'Nolan': {'babelSynsetID: ..., ...}, ...}

        Args:
            raw_source (RawInformationSource): source to iterate over

        Returns:
            List[EntitiesProp]: one entry per raw content, in source order

        Raises:
            AttributeError: raised while reading the entities; the message reports it as the
                BabelFy request limit being reached
        """
        logger.info("Doing Entity Linking with BabelFy")

        # properties stored for every linked entity, in this order
        entity_keys = ('babelSynsetID', 'DBPediaURL', 'BabelNetURL', 'score',
                       'coherenceScore', 'globalScore', 'source')

        properties_list = []
        for raw_content in progbar(raw_source,
                                   max_value=len(list(raw_source))):
            text = check_not_tokenized(raw_content[self.__field_to_link])
            self.__babel_client.babelfy(text)

            try:
                linked_entities = {}
                merged_entities = self.__babel_client.merged_entities
                if merged_entities is not None:
                    for entity in merged_entities:
                        linked_entities[entity['text']] = {
                            key: entity[key] if entity.get(key) is not None else ''
                            for key in entity_keys
                        }

                properties_list.append(EntitiesProp(linked_entities))
            except AttributeError:
                raise AttributeError(
                    "BabelFy limit reached! Insert an api key or change it if you inserted one!"
                )

        return properties_list
Ejemplo n.º 30
0
def show_progress(coll, milestones=100):
    """
    Generator that yields the elements of coll one by one and logs how many have been
    processed so far.

    Args:
        coll (list): collection of the elements to process
        milestones (int): how often an update is logged. For example, with
            milestones = 100 an update is logged every 100 elements processed
    """
    for count, element in enumerate(coll, start=1):
        yield element
        # the count is checked only after the consumer asks for the next element
        if count % milestones == 0:
            logger.info('Processed %s elements', count)