Example #1
    def fit(self):
        """
        Creates the contents and serializes them. This method starts the content production process and
        initializes everything needed to create the contents, their fields and their representations
        """
        # before starting the process, the content analyzer main checks that there are no duplicate ids,
        # both in the field dictionary and in the exogenous representation list.
        # this is done now rather than for each content during the creation process, in order to avoid
        # starting an operation that is bound to fail
        # both checks raise ValueError on duplicate ids, which simply propagates to the caller
        self.__check_field_dict()
        self.__check_exogenous_representation_list()

        # creates the directory where the data will be serialized and overwrites it if it already exists
        output_path = self.__config.output_directory
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)

        contents_producer = ContentsProducer.get_instance()
        contents_producer.set_config(self.__config)
        created_contents = contents_producer.create_contents()

        if self.__config.export_json:
            json_path = os.path.join(self.__config.output_directory,
                                     'contents.json')
            with open(json_path, "w") as data:
                json.dump(created_contents, data, cls=ContentEncoder, indent=4)

        for content in progbar(created_contents,
                               prefix="Serializing contents: "):
            self.__serialize_content(content)
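The export_json branch above delegates serialization of non-JSON types to ContentEncoder. A minimal sketch of that encoder pattern, using hypothetical FakeContent and FakeContentEncoder stand-ins rather than the framework's actual classes:

import json

# Hypothetical stand-in for the framework's Content class.
class FakeContent:
    def __init__(self, content_id, fields):
        self.content_id = content_id
        self.fields = fields

# Sketch of an encoder in the style of ContentEncoder: fall back to the
# object's __dict__ when the default JSON types don't apply.
class FakeContentEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, FakeContent):
            return obj.__dict__
        return super().default(obj)

print(json.dumps([FakeContent("tt0114709", {"plot": "..."})],
                 cls=FakeContentEncoder, indent=4))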
Example #2
def get_rated_items(items_directory, ratings) -> List[Content]:
    """
    Gets the items that a user has rated

    Args:
        items_directory (str): Path to the items directory
        ratings (pd.DataFrame): Ratings of the user

    Returns:
        rated_items (List[Content]): List of items that the user has rated
    """
    directory_filename_list = [os.path.splitext(filename)[0]
                               for filename in os.listdir(items_directory)
                               if filename != 'search_index']

    # logger.info("Getting filenames from IDs")
    # list of id of item without rating
    rated_items_filename_list = set([re.sub(r'[^\w\s]', '', item_id) for item_id in ratings.to_id])

    # logger.info("Checking if rated")
    filename_list = [item_id for item_id in directory_filename_list if
                     item_id in rated_items_filename_list]

    filename_list.sort()

    rated_items = [
        load_content_instance(items_directory, item_id)
        for item_id in progbar(filename_list, prefix="Loading rated items:")]

    return rated_items
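The id normalization above strips punctuation so the ids from the ratings frame can be matched against the serialized filenames. A quick self-contained check of what that regex does (the ids are made up):

import re

# Strip everything that is neither a word character nor whitespace,
# so ids with punctuation can match the punctuation-free filenames.
for raw_id in ["tt0114709", "tt0112302!", "item-001"]:
    print(raw_id, "->", re.sub(r'[^\w\s]', '', raw_id))
# tt0114709 -> tt0114709
# tt0112302! -> tt0112302
# item-001 -> item001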
Example #3
    def import_ratings(self) -> pd.DataFrame:
        """
        Imports the ratings from the source and stores them in a DataFrame

        Returns:
            ratings_frame (pd.DataFrame): the DataFrame containing the imported ratings
        """
        ratings_frame = {'from_id': [], 'to_id': [], 'score': [], 'timestamp': []}
        for row in progbar(list(self.__source), prefix="Importing ratings:"):

            ratings_frame['from_id'].append(self._get_field_data(self.from_id_column, row))

            ratings_frame['to_id'].append(self._get_field_data(self.to_id_column, row))

            if self.timestamp_column:
                ratings_frame['timestamp'].append(self._get_field_data(self.timestamp_column, row))

            ratings_frame['score'].append(self._get_field_data(self.score_column, row))

        if len(ratings_frame['timestamp']) == 0:
            del ratings_frame['timestamp']

        if self.score_processor:
            ratings_frame['score'] = self.score_processor.fit(ratings_frame['score'])
        else:
            ratings_frame['score'] = [float(score) for score in ratings_frame['score']]

        self.rating_frame = pd.DataFrame(ratings_frame)
        return self.rating_frame
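The dict-of-lists accumulation above becomes a DataFrame only at the end, with the optional 'timestamp' column dropped if it was never filled. A minimal self-contained sketch of that step, with made-up values:

import pandas as pd

ratings_frame = {'from_id': ['u1', 'u2'], 'to_id': ['i1', 'i2'],
                 'score': ['4.0', '3.5'], 'timestamp': []}

# The optional column is deleted before building the frame if it stayed empty
if len(ratings_frame['timestamp']) == 0:
    del ratings_frame['timestamp']

# Without a score_processor the raw scores are simply cast to float
ratings_frame['score'] = [float(score) for score in ratings_frame['score']]
print(pd.DataFrame(ratings_frame))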
Example #4
    def populate_from_dataframe(self, source_frame: pd.DataFrame):
        """
        Populate the graph using a DataFrame.
        It must have a 'from_id', 'to_id' and 'score' column.

        The method iterates over every row and creates a weighted link for every user-item pair in the rating
        frame, based on the score the user gave the item, creating the nodes if they don't exist.

        Args:
            source_frame (pd.DataFrame): the rating frame from where the graph will be populated
        """
        if self._check_columns(source_frame):
            for idx, row in progbar(source_frame.iterrows(),
                                    max_value=len(source_frame),
                                    prefix="Populating Graph:"):
                self.add_user_node(row['from_id'])
                self.add_item_node(row['to_id'])
                if 'label' in source_frame.columns:
                    label = row['label']
                else:
                    label = self.get_default_score_label()
                self.add_link(row['from_id'], row['to_id'], row['score'],
                              label=label)
        else:
            raise ValueError('The source frame must contain at least \'from_id\', \'to_id\', \'score\' columns')
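For reference, the minimal frame this method accepts: 'from_id', 'to_id' and 'score' are mandatory, 'label' is optional. The graph instance below is hypothetical, hence the commented-out call:

import pandas as pd

ratings = pd.DataFrame({'from_id': ['u1', 'u1', 'u2'],
                        'to_id': ['i1', 'i2', 'i1'],
                        'score': [0.8, 0.4, 0.6]})
# graph.populate_from_dataframe(ratings)  # 'graph' would be a concrete graph instance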
Example #5
    def populate_from_dataframe(self, source_frame: pd.DataFrame):
        """
        Populate the graph using a DataFrame.
        It must have a 'from_id', 'to_id' and 'score' column.

        The method iterates over every row and creates a weighted link for every user-item pair in the rating
        frame, based on the score the user gave the item, creating the nodes if they don't exist.
        We also add properties to 'item' nodes if the item_contents_dir is specified,
        and add properties to 'user' nodes if the user_contents_dir is specified.

        Args:
            source_frame (pd.DataFrame): the rating frame from where the graph will be populated
        """
        if self._check_columns(source_frame):
            for idx, row in progbar(source_frame.iterrows(),
                                    max_value=len(source_frame),
                                    prefix="Populating Graph:"):

                self.add_user_node(row['from_id'])
                self.add_item_node(row['to_id'])
                self.add_link(row['from_id'], row['to_id'], self.normalize_score(row['score']),
                              label=self.get_default_score_label())
                if self.get_item_contents_dir() is not None:
                    self._add_item_properties(row)

                if self.get_user_contents_dir() is not None:
                    self._add_usr_properties(row)
        else:
            raise ValueError('The source frame must contain at least \'from_id\', \'to_id\', \'score\' columns')
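Unlike Example #4, this variant normalizes the score before adding the link. normalize_score is framework-defined and its exact mapping is not shown here; a common convention (an assumption, not taken from the source) maps a 1-5 star rating into [-1, 1]:

# Assumed convention, for illustration only: map a rating in [low, high]
# into the [-1, 1] range.
def normalize_score_sketch(score, low=1.0, high=5.0):
    return (score - low) / (high - low) * 2 - 1

print(normalize_score_sketch(5.0))  # 1.0
print(normalize_score_sketch(3.0))  # 0.0
print(normalize_score_sketch(1.0))  # -1.0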
Example #6
    def eval_metrics(
            self,
            metric_list: List[Metric]) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Method which effectively evaluates the generated recommendations with the list of metrics passed as argument.

        It returns two pandas DataFrames: the first one contains system results on all metrics specified, the second
        one contains each user's results for every eligible metric

        Args:
            metric_list (List[Metric]): List of metrics on which recommendations need to be evaluated

        Returns:
            Two pandas DataFrames, the first will contain the system result for every metric specified inside the
            metric list, the second one will contain every user's results for every eligible metric
        """

        frames_to_concat = []

        eval_logger.info('Performing metrics chosen')

        for metric in progbar(metric_list,
                              prefix='Performing {}:',
                              substitute_with_current=True):

            metric_result_list = []

            if self._split_list is None:
                split_list = metric._get_pred_truth_list()
            else:
                split_list = self._split_list

            for split in split_list:
                if not split.pred.empty and not split.truth.empty:
                    from_id_valid = split.pred['from_id']
                    # Remove from the truth the items for which we have no predictions
                    split.truth = split.truth.query(
                        'from_id in @from_id_valid')
                    metric_result = metric.perform(split)
                    metric_result_list.append(metric_result)

            total_results_metric = pd.concat(metric_result_list)

            if not total_results_metric.empty:
                total_results_metric = total_results_metric.groupby(
                    'from_id').mean()

                total_results_metric.index.name = 'from_id'

                frames_to_concat.append(total_results_metric)

        final_result = pd.concat(frames_to_concat, axis=1)

        system_results = final_result.loc[['sys']]
        each_user_result = final_result.drop(['sys'])

        each_user_result = each_user_result.dropna(axis=1, how='all')

        return system_results, each_user_result
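The aggregation at the end of the method is plain pandas: per-split metric frames are averaged per user, joined column-wise, then the special 'sys' row is separated from the per-user rows. A self-contained sketch with made-up metric frames:

import pandas as pd

m1 = pd.DataFrame({'from_id': ['u1', 'u2', 'sys'], 'Precision': [0.5, 0.7, 0.6]})
m2 = pd.DataFrame({'from_id': ['u1', 'u2', 'sys'], 'Recall': [0.4, 0.8, 0.6]})

# Average per user, then join the metrics column-wise on the 'from_id' index
frames = [m.groupby('from_id').mean() for m in (m1, m2)]
final_result = pd.concat(frames, axis=1)

print(final_result.loc[['sys']])   # system-wide results
print(final_result.drop(['sys']))  # per-user results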
Example #7
    def eval_fit_recsys(cls, recsys: RecSys, split_list: List[Split],
                        test_items_list: List[pd.DataFrame]):
        """
        Method which is usually called by the 'PredictionCalculator' module that generates recommendation lists. For
        every user, items that will be ranked are specified by the 'test_items_list' parameter.

        Rankings generated will be stored into a class attribute (rank_truth_list), which is a list that contains
        Split objects: every object has two DataFrames, the first one has recommendation lists for every user,
        the second one has the 'ground truth' for every user.

        If the class attribute is non-empty, then the 'AlreadyFittedRecSys' exception is raised, so remember to
        clean the class attribute by calling the private method '_clean_pred_truth_list(...)' upon every new evaluation

        Args:
            recsys (RecSys): Recommender system which will generate predictions that will later be evaluated
            split_list (List[Split]): List of Split objects where every Split contains two DataFrames, the first has
                the 'train set' for every user, the second has the 'test set' for every user
            test_items_list (List[pd.DataFrame]): List of DataFrames, one for every Split object inside the split_list
                parameter, where every DataFrame contains for every user the list of items that must be ranked

        Raises:
            AlreadyFittedRecSys exception when the class attribute 'rank_truth_list' is non-empty, meaning that
            recommendation lists have already been calculated
        """

        if len(cls.rank_truth_list) != 0:
            raise AlreadyFittedRecSys

        for counter, (split, test_items_frame) in enumerate(
                zip(split_list, test_items_list), start=1):

            train = split.train
            test = split.test

            rank_truth = Split()
            rank_truth.truth = test

            frame_to_concat = []
            user_list_to_fit = set(train.from_id)

            for user in progbar(
                    user_list_to_fit,
                    prefix='Calculating rank for user {} - split {}'.format(
                        '{}', counter),
                    substitute_with_current=True):

                user_ratings_train = train.loc[train['from_id'] == user]

                test_items = list(
                    test_items_frame.query('from_id == @user').to_id)

                result = recsys._eval_fit_rank(user_ratings_train, test_items)

                frame_to_concat.append(result)

            rank_truth.pred = pd.concat(frame_to_concat)

            cls.rank_truth_list.append(rank_truth)
Example #8
    def get_properties(self,
                       raw_source: RawInformationSource) -> List[EntitiesProp]:
        """
        Produces a list of EntitiesProp objects, one for every raw content in the raw source.

        An EntitiesProp object is basically a dict where the keys are the linked entities (since there can be
        multiple entities in a field) and the values are the properties retrieved from BabelPy for that entity.
        EXAMPLE:
            properties_list = [EntitiesProp(), EntitiesProp(), ...]

            EntitiesProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...}, 'Nolan': {'babelSynsetID': ..., ...}, ...}

        """
        properties_list = []
        logger.info("Doing Entity Linking with BabelFy")
        for raw_content in progbar(raw_source,
                                   max_value=len(list(raw_source))):
            data_to_disambiguate = check_not_tokenized(
                raw_content[self.__field_to_link])

            self.__babel_client.babelfy(data_to_disambiguate)

            properties_content = {}
            try:
                if self.__babel_client.merged_entities is not None:

                    for entity in self.__babel_client.merged_entities:
                        properties_entity = {
                            'babelSynsetID': '',
                            'DBPediaURL': '',
                            'BabelNetURL': '',
                            'score': '',
                            'coherenceScore': '',
                            'globalScore': '',
                            'source': ''
                        }

                        for key in properties_entity:
                            if entity.get(key) is not None:
                                properties_entity[key] = entity[key]

                        properties_content[entity['text']] = properties_entity

                properties_list.append(EntitiesProp(properties_content))
            except AttributeError:
                raise AttributeError(
                    "BabelFy limit reached! Insert an api key or change it if you inserted one!"
                )

        return properties_list
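The inner loop uses a simple defaulting pattern: start from empty-string defaults and overwrite only the keys BabelFy actually returned. Isolated, with a made-up entity dict:

# Made-up entity dict; in the real code it comes from babel_client.merged_entities
entity = {'text': 'DiCaprio', 'babelSynsetID': 'bn:03342737n', 'score': 0.9}

properties_entity = {'babelSynsetID': '', 'DBPediaURL': '', 'score': ''}
for key in properties_entity:
    if entity.get(key) is not None:
        properties_entity[key] = entity[key]

print(properties_entity)
# {'babelSynsetID': 'bn:03342737n', 'DBPediaURL': '', 'score': 0.9}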
Example #9
    def add_score_column(self, score_column: Union[str, int], column_name: str, score_processor: RatingProcessor = None):
        """
        Extracts the given column from the source and appends it to the rating frame as 'column_name',
        converting the values with the score_processor if one is given, otherwise casting them to float
        """
        col_to_add = []
        for row in progbar(list(self.__source), prefix="Adding column {}:".format(column_name)):

            col_to_add.append(self._get_field_data(score_column, row))

        if score_processor:
            col_to_add = score_processor.fit(col_to_add)
        else:
            col_to_add = [float(score) for score in col_to_add]

        self.rating_frame[column_name] = col_to_add

        return self.rating_frame
Example #10
    def populate_from_dataframe(self, source_frame: pd.DataFrame):
        """
        Populate the graph using a DataFrame.
        It must have a 'from_id', 'to_id' and 'score' column.

        The method iterates over every row and creates a weighted link for every user-item pair in the rating
        frame, based on the score the user gave the item, creating the nodes if they don't exist.
        We also add properties to 'item' nodes if the item_contents_dir is specified,
        and add properties to 'user' nodes if the user_contents_dir is specified.

        Args:
            source_frame (pd.DataFrame): the rating frame from where the graph will be populated
        """
        if self._check_columns(source_frame):
            for row in progbar(source_frame.to_dict('records'),
                               max_value=len(source_frame),
                               prefix="Populating Graph:"):

                # If the node already exists we don't add it and, more importantly,
                # we don't retrieve its exogenous properties if specified, since they
                # have already been retrieved previously.
                if not self.node_exists(UserNode(row['from_id'])):
                    self.add_user_node(row['from_id'])
                    if self.get_user_contents_dir() is not None:
                        self._add_usr_properties(row)

                # If the node already exists we don't add it and, more importantly,
                # we don't retrieve its exogenous properties if specified, since they
                # have already been retrieved previously.
                if not self.node_exists(ItemNode(row['to_id'])):
                    self.add_item_node(row['to_id'])
                    if self.get_item_contents_dir() is not None:
                        self._add_item_properties(row)

                if 'label' in source_frame.columns:
                    label = row['label']
                else:
                    label = self.get_default_score_label()

                self.add_link(UserNode(row['from_id']),
                              ItemNode(row['to_id']),
                              row['score'],
                              label=label)
        else:
            raise ValueError(
                'The source frame must contain at least \'from_id\', \'to_id\', \'score\' columns'
            )
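Note the iteration change with respect to Examples #4 and #5: to_dict('records') yields one plain dict per row, avoiding the per-row Series construction that iterrows() performs. A quick comparison:

import pandas as pd

frame = pd.DataFrame({'from_id': ['u1'], 'to_id': ['i1'], 'score': [0.8]})

# iterrows() yields (index, Series) pairs; to_dict('records') yields plain dicts
for idx, row in frame.iterrows():
    print(type(row))  # <class 'pandas.core.series.Series'>
for row in frame.to_dict('records'):
    print(type(row))  # <class 'dict'>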
Example #11
    def eval_metrics(
            self,
            metric_list: List[Metric]) -> Tuple[pd.DataFrame, pd.DataFrame]:

        frames_to_concat = []

        eval_logger.info('Performing metrics chosen')

        for metric in progbar(metric_list,
                              prefix='Performing {}:',
                              substitute_with_current=True):

            metric_result_list = []

            if self._split_list is None:
                split_list = metric._get_pred_truth_list()
            else:
                split_list = self._split_list

            for split in split_list:
                if not split.pred.empty and not split.truth.empty:
                    from_id_valid = split.pred['from_id']
                    # Remove from the truth the items for which we have no predictions
                    split.truth = split.truth.query(
                        'from_id in @from_id_valid')
                    metric_result = metric.perform(split)
                    metric_result_list.append(metric_result)

            total_results_metric = pd.concat(metric_result_list)

            if not total_results_metric.empty:
                total_results_metric = total_results_metric.groupby(
                    'from_id').mean()

                total_results_metric.index.name = 'from_id'

                frames_to_concat.append(total_results_metric)

        final_result = pd.concat(frames_to_concat, axis=1)

        system_results = final_result.loc[['sys']]
        each_user_result = final_result.drop(['sys'])

        each_user_result = each_user_result.dropna(axis=1, how='all')

        return system_results, each_user_result
Example #12
    def split_all(self, ratings: pd.DataFrame, user_id_list: Set[str]):
        """
        Method that effectively splits the 'ratings' parameter into 'train set' and 'test set'.
        A 'user_id_list' parameter must be specified so that the method does the splitting only for the users
        inside the list.

        Args:
            ratings (pd.DataFrame): The DataFrame which contains the interactions of the users that must be split
                into 'train set' and 'test set'
            user_id_list (Set[str]): The set of users for which splitting will be done
        """

        split_list = []

        eval_logger.info("Performing {} on ratings of every user".format(
            str(self._partition_technique)))
        for user_id in progbar(user_id_list,
                               prefix="Current user - {}:",
                               substitute_with_current=True):
            user_ratings = ratings[ratings['from_id'] == user_id]
            try:
                user_splits_list = self._split_single(user_ratings)
            except PartitionError as e:
                eval_logger.warning(
                    str(e) + "\nThe user {} will be skipped".format(user_id))
                continue

            if len(split_list) != 0:
                for user_split, total_split in zip(user_splits_list,
                                                   split_list):
                    total_split.train = pd.concat(
                        [total_split.train, user_split.train])
                    total_split.test = pd.concat(
                        [total_split.test, user_split.test])
            else:
                for user_split in user_splits_list:
                    split_list.append(user_split)  # Only executed once

        return split_list
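The accumulation logic is worth isolating: the first user's splits seed the list, and every following user's train/test frames are concatenated split-by-split onto it. A self-contained sketch with a hypothetical SplitSketch stand-in for the framework's Split:

import pandas as pd

class SplitSketch:  # hypothetical stand-in for the framework's Split
    def __init__(self, train, test):
        self.train, self.test = train, test

split_list = []
for user in ['u1', 'u2']:
    # one split per user here; a real partitioner may return several
    user_splits = [SplitSketch(pd.DataFrame({'from_id': [user]}),
                               pd.DataFrame({'from_id': [user]}))]
    if len(split_list) != 0:
        for user_split, total_split in zip(user_splits, split_list):
            total_split.train = pd.concat([total_split.train, user_split.train])
            total_split.test = pd.concat([total_split.test, user_split.test])
    else:
        split_list = user_splits

print(split_list[0].train)  # rows for both u1 and u2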
Example #13
    def split_all(self, ratings: pd.DataFrame, user_id_list: Set[str]):

        split_list = []

        eval_logger.info("Performing {} on ratings of every user".format(str(self._partition_technique)))
        for user_id in progbar(user_id_list, prefix="Current user - {}:", substitute_with_current=True):
            user_ratings = ratings[ratings['from_id'] == user_id]
            try:
                user_splits_list = self._split_single(user_ratings)
            except PartitionError as e:
                eval_logger.warning(str(e) + "\nThe user {} will be skipped".format(user_id))
                continue

            if len(split_list) != 0:
                for user_split, total_split in zip(user_splits_list, split_list):
                    total_split.train = pd.concat([total_split.train, user_split.train])
                    total_split.test = pd.concat([total_split.test, user_split.test])
            else:
                for user_split in user_splits_list:
                    split_list.append(user_split)  # Only executed once

        return split_list
Example #14
    def eval_fit_recsys(cls, recsys: RecSys, split_list: List[Split],
                        test_items_list: List[pd.DataFrame]):
        if len(cls.rank_truth_list) != 0:
            raise AlreadyFittedRecSys

        for counter, (split, test_items_frame) in enumerate(
                zip(split_list, test_items_list), start=1):

            train = split.train
            test = split.test

            rank_truth = Split()
            rank_truth.truth = test

            frame_to_concat = []
            user_list_to_fit = set(train.from_id)

            for user in progbar(
                    user_list_to_fit,
                    prefix='Calculating rank for user {} - split {}'.format(
                        '{}', counter),
                    substitute_with_current=True):

                user_ratings_train = train.loc[train['from_id'] == user]

                test_items = list(
                    test_items_frame.query('from_id == @user').to_id)

                result = recsys._eval_fit_rank(user_ratings_train, test_items)

                frame_to_concat.append(result)

            rank_truth.pred = pd.concat(frame_to_concat)

            cls.rank_truth_list.append(rank_truth)
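The query call above relies on pandas' @ syntax, which references a local Python variable from inside the query string:

import pandas as pd

test_items_frame = pd.DataFrame({'from_id': ['u1', 'u1', 'u2'],
                                 'to_id': ['i1', 'i2', 'i3']})
user = 'u1'
# '@user' resolves to the local variable 'user'
print(list(test_items_frame.query('from_id == @user').to_id))  # ['i1', 'i2']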