Beispiel #1
0
def get_item_average_overall_rating(reviews, item_id, apply_filter=True):
    """
    Computes the mean of the 'overall_rating' values of the reviews of an item

    :param reviews: a list of reviews, each one with an 'overall_rating' entry
    :param item_id: the ID of the item
    :param apply_filter: when True, the reviews are first passed through
    ETLUtils.filter_records with the 'offering_id' field and item_id; when
    False, every review in the list is used as given
    :return: the average (or mean) of the overall ratings, as a float
    :raise ZeroDivisionError: if there are no reviews to average
    """
    item_reviews = reviews
    if apply_filter:
        item_reviews = ETLUtils.filter_records(
            reviews, 'offering_id', [item_id])

    num_ratings = len(item_reviews)

    # The ratings are added one by one (instead of using sum()) so that the
    # floating point result comes from a plain left-to-right accumulation
    total = 0.
    for item_review in item_reviews:
        total = total + float(item_review['overall_rating'])

    return total / float(num_ratings)
Beispiel #2
0
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    mean of the overall ratings found for that item

    :param reviews: a list of reviews, each one with an 'offering_id' and an
    'overall_rating' entry
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the reviews have to be
    filtered by user_id or not. In other words, this boolean indicates if the
    list contains reviews from several users or not. If it does contain reviews
    from other users, those have to be removed
    :return: a dictionary with the items that the given user has rated; an
    empty dictionary if there are no reviews
    """
    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    # Only the rating column is averaged. Averaging the whole frame would also
    # try to average non-numeric columns (e.g. 'user_id'), which recent pandas
    # versions reject with a TypeError
    mean_ratings =\
        DataFrame(user_reviews).groupby('offering_id')['overall_rating'].mean()

    return dict(zip(mean_ratings.index.tolist(), mean_ratings.tolist()))
Beispiel #3
0
    def test_filter_records(self):
        """
        Checks that filtering reviews_matrix_5_short by the 'offering_id'
        values 1, 3 and 5 returns exactly the expected list of records
        """
        wanted_items = [1, 3, 5]

        # (user_id, offering_id, overall_rating) of every expected record
        expected_rows = [
            ('U1', 1, 5.0),
            ('U1', 3, 5.0),
            ('U2', 1, 5.0),
            ('U2', 3, 5.0),
            ('U2', 5, 9.0),
        ]
        expected_result = [
            {'user_id': user, 'offering_id': item, 'overall_rating': rating}
            for user, item, rating in expected_rows
        ]

        actual_result = ETLUtils.filter_records(
            reviews_matrix_5_short, 'offering_id', wanted_items)

        self.assertEqual(expected_result, actual_result)
Beispiel #4
0
def get_item_average_overall_rating(reviews, item_id, apply_filter=True):
    """
    Averages the 'overall_rating' entries of the reviews of a given item

    :param reviews: a list of reviews, each one with an 'overall_rating' entry
    :param item_id: the ID of the item
    :param apply_filter: if True the list is first passed through
    ETLUtils.filter_records using the 'offering_id' field and item_id; if
    False the list is averaged as it is
    :return: the average (or mean) of the overall ratings, as a float
    :raise ZeroDivisionError: if there are no reviews to average
    """
    if apply_filter:
        selected_reviews = ETLUtils.filter_records(
            reviews, 'offering_id', [item_id])
    else:
        selected_reviews = reviews

    count = len(selected_reviews)

    # Plain running total, one rating at a time
    running_total = 0.
    for selected_review in selected_reviews:
        running_total = running_total + float(
            selected_review['overall_rating'])

    return running_total / float(count)
Beispiel #5
0
    def calculate_sparsity(self):
        """
        Computes which fraction of the user-item combinations has no review in
        the list of reviews of this object

        :return: 1 - (number of user-item pairs that have a review /
        (number of users * number of items))
        :raise ValueError: in case the list of reviews is empty
        """
        if not self.reviews:
            raise ValueError(
                'Can not determine the sparsity for an empty list')

        users = extractor.get_groupby_list(
            self.reviews, Constants.USER_ID_FIELD)
        items = extractor.get_groupby_list(
            self.reviews, Constants.ITEM_ID_FIELD)

        # The set of items is the same for every user, so it is built once
        known_items = set(items)
        num_possible_reviews = len(users) * len(items)

        num_found_reviews = 0.
        for user_id in users:
            reviews_of_user = ETLUtils.filter_records(
                self.reviews, Constants.USER_ID_FIELD, [user_id])
            items_of_user = extractor.get_groupby_list(
                reviews_of_user, Constants.ITEM_ID_FIELD)
            num_found_reviews += len(known_items.intersection(items_of_user))

        return 1 - num_found_reviews / num_possible_reviews
Beispiel #6
0
def initialize_users(reviews, is_multi_criteria):
    """
    Builds a dictionary with one User object per user ID found in the reviews.
    Every user receives its average overall rating and its item ratings, and,
    if requested, its multi-criteria item ratings

    :param reviews: the list of reviews
    :param is_multi_criteria: when true, user.item_multi_ratings is also filled
    in (with get_user_item_multi_ratings)
    :return: a dictionary with the users initialized, the keys of the
    dictionary are the users' IDs
    """
    users_by_id = {}

    for user_id in get_groupby_list(reviews, 'user_id'):
        user = User(user_id)
        # The reviews are filtered once here, which is why the helpers below
        # are called without their own filtering
        reviews_of_user =\
            ETLUtils.filter_records(reviews, 'user_id', [user_id])

        user.average_overall_rating = get_user_average_overall_rating(
            reviews_of_user, user_id, apply_filter=False)
        user.item_ratings = get_user_item_ratings(reviews_of_user, user_id)

        if is_multi_criteria:
            user.item_multi_ratings = get_user_item_multi_ratings(
                reviews_of_user, user_id)

        users_by_id[user_id] = user

    return users_by_id
Beispiel #7
0
    def get_records_to_predict_topn(self):
        """
        Creates and initializes the TopNEvaluator of this object and uses it
        to set self.important_records and self.records_to_predict. At the end
        self.test_records is set to None and a garbage collection is requested
        """
        print('get_records_to_predict_topn: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        self.top_n_evaluator = TopNEvaluator(self.records, self.test_records,
                                             Constants.ITEM_TYPE, 10,
                                             Constants.TOPN_NUM_ITEMS)
        self.top_n_evaluator.initialize()
        self.important_records = self.top_n_evaluator.important_records

        # Optionally restrict the important records by passing them through
        # ETLUtils.filter_records with Constants.HAS_CONTEXT_FIELD and [True]
        if Constants.TEST_CONTEXT_REVIEWS_ONLY:
            self.important_records = ETLUtils.filter_records(
                self.important_records, Constants.HAS_CONTEXT_FIELD, [True])

            # NOTE(review): this value is overwritten at the end of the method
            # and is computed before the evaluator receives the filtered
            # important records; the call looks redundant -- confirm that
            # get_records_to_predict has no side effects before removing it
            self.records_to_predict =\
                self.top_n_evaluator.get_records_to_predict()

        # Optionally cap the number of important records with a random sample
        if Constants.MAX_SAMPLE_TEST_SET is not None:
            print('important_records %d' % len(self.important_records))
            if len(self.important_records) > Constants.MAX_SAMPLE_TEST_SET:
                self.important_records = random.sample(
                    self.important_records, Constants.MAX_SAMPLE_TEST_SET)
            else:
                message = 'WARNING max_sample_test_set is greater than the ' \
                          'number of important records'
                print(message)

        # Hand the (possibly filtered and sampled) records back to the
        # evaluator before asking it for the records to predict
        self.top_n_evaluator.important_records = self.important_records
        self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
        # Drop this object's reference to the test records and request a
        # garbage collection
        self.test_records = None
        gc.collect()
Beispiel #8
0
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Builds a dictionary with one User object per user ID found in the reviews.
    Every user receives its average overall rating, its criteria weights, its
    cluster, its item ratings and its multi-criteria item ratings

    :param reviews: the list of reviews
    :param significant_criteria_ranges: passed unchanged to
    get_significant_criteria together with the criteria weights of each user
    :return: a dictionary with the users initialized, the keys of the
    dictionary are the users' IDs
    """
    users_by_id = {}

    for user_id in get_groupby_list(reviews, 'user_id'):
        user = User(user_id)
        # Filtered once here, so the helpers below skip their own filtering
        reviews_of_user =\
            ETLUtils.filter_records(reviews, 'user_id', [user_id])

        user.average_overall_rating = get_user_average_overall_rating(
            reviews_of_user, user_id, apply_filter=False)
        user.criteria_weights = get_criteria_weights(
            reviews_of_user, user_id, apply_filter=False)

        # Only the second element of the result is kept, as the cluster
        _unused, user.cluster = get_significant_criteria(
            user.criteria_weights, significant_criteria_ranges)

        user.item_ratings = get_user_item_ratings(reviews_of_user, user_id)
        user.item_multi_ratings = get_user_item_multi_ratings(
            reviews_of_user, user_id)

        users_by_id[user_id] = user

    return users_by_id
Beispiel #9
0
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    mean of the overall ratings found for that item

    :param reviews: a list of reviews, each one with an 'offering_id' and an
    'overall_rating' entry
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the reviews have to be
    filtered by user_id or not. In other words, this boolean indicates if the
    list contains reviews from several users or not. If it does contain reviews
    from other users, those have to be removed
    :return: a dictionary with the items that the given user has rated; an
    empty dictionary if there are no reviews
    """
    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    # Select the rating column before averaging: taking the mean of every
    # column fails with a TypeError on recent pandas versions as soon as the
    # reviews carry a non-numeric column such as 'user_id'
    ratings_by_item = DataFrame(user_reviews).groupby('offering_id')
    mean_ratings = ratings_by_item['overall_rating'].mean()

    return dict(zip(mean_ratings.index.tolist(), mean_ratings.tolist()))
    def calculate_sparsity(self):
        """
        Computes which fraction of the user-item combinations has no review in
        the list of reviews of this object

        :return: 1 - (number of user-item pairs that have a review /
        (number of users * number of items))
        :raise ValueError: in case the list of reviews is empty
        """
        if not self.reviews:
            raise ValueError("Can not determine the sparsity for an empty list")

        users = extractor.get_groupby_list(self.reviews, "user_id")
        items = extractor.get_groupby_list(self.reviews, "offering_id")

        # The set of items is the same for every user, so it is built once
        known_items = set(items)
        num_possible_reviews = len(users) * len(items)

        num_found_reviews = 0.0
        for user_id in users:
            reviews_of_user = ETLUtils.filter_records(
                self.reviews, "user_id", [user_id])
            items_of_user = extractor.get_groupby_list(
                reviews_of_user, "offering_id")
            num_found_reviews += len(known_items.intersection(items_of_user))

        return 1 - num_found_reviews / num_possible_reviews
Beispiel #11
0
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Creates a User object for each user ID present in the reviews and stores
    it in a dictionary. Each user is given its average overall rating, its
    criteria weights, its cluster, its item ratings and its multi-criteria
    item ratings

    :param reviews: the list of reviews
    :param significant_criteria_ranges: forwarded as it is to
    get_significant_criteria along with the criteria weights of each user
    :return: a dictionary with the users initialized, the keys of the
    dictionary are the users' IDs
    """
    initialized_users = {}

    for current_id in get_groupby_list(reviews, 'user_id'):
        user = User(current_id)
        # One filtering pass per user; the helpers receive the filtered list
        own_reviews = ETLUtils.filter_records(reviews, 'user_id', [current_id])

        user.average_overall_rating = get_user_average_overall_rating(
            own_reviews, current_id, apply_filter=False)
        user.criteria_weights = get_criteria_weights(
            own_reviews, current_id, apply_filter=False)

        # The first element of the result is not needed here
        _unused, user.cluster = get_significant_criteria(
            user.criteria_weights, significant_criteria_ranges)

        user.item_ratings = get_user_item_ratings(own_reviews, current_id)
        user.item_multi_ratings = get_user_item_multi_ratings(
            own_reviews, current_id)

        initialized_users[current_id] = user

    return initialized_users
Beispiel #12
0
def initialize_users(reviews, is_multi_criteria):
    """
    Creates a User object for each user ID present in the reviews and stores
    it in a dictionary. Each user is given its average overall rating and its
    item ratings; the multi-criteria item ratings are added on request

    :param reviews: the list of reviews
    :param is_multi_criteria: when true, user.item_multi_ratings is also set
    (using get_user_item_multi_ratings)
    :return: a dictionary with the users initialized, the keys of the
    dictionary are the users' IDs
    """
    initialized_users = {}

    for current_id in get_groupby_list(reviews, 'user_id'):
        user = User(current_id)
        # One filtering pass per user; the helpers receive the filtered list
        own_reviews = ETLUtils.filter_records(reviews, 'user_id', [current_id])

        user.average_overall_rating = get_user_average_overall_rating(
            own_reviews, current_id, apply_filter=False)
        user.item_ratings = get_user_item_ratings(own_reviews, current_id)

        if is_multi_criteria:
            user.item_multi_ratings = get_user_item_multi_ratings(
                own_reviews, current_id)

        initialized_users[current_id] = user

    return initialized_users
Beispiel #13
0
def analyze_context_records():
    """
    Loads the records of Constants.CLASSIFIED_RECORDS_FILE, passes them through
    ETLUtils.filter_records with the 'context_type' field and the value
    'context', and prints how many records are left followed by the
    Constants.TEXT_FIELD entry of each one of them
    """
    context_records = ETLUtils.filter_records(
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE),
        'context_type', ['context'])

    print('num records: %d' % len(context_records))

    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
Beispiel #14
0
def remove_items_with_low_reviews(reviews, min_reviews):
    """
    Returns the reviews whose item is among the items that
    get_item_list(reviews, min_reviews) selects

    :param reviews: a list of reviews
    :param min_reviews: forwarded to get_item_list; presumably the minimum
    number of reviews an item needs in order to be kept (confirm against
    get_item_list)
    :return: the result of filtering the reviews by 'offering_id' with the
    selected items
    """
    return ETLUtils.filter_records(
        reviews, 'offering_id', get_item_list(reviews, min_reviews))
Beispiel #15
0
def remove_items_with_low_reviews(reviews, min_reviews):
    """
    Keeps only the reviews of the items returned by
    get_item_list(reviews, min_reviews)

    :param reviews: a list of reviews
    :param min_reviews: handed over to get_item_list; presumably the minimum
    number of reviews an item must have to be kept (confirm against
    get_item_list)
    :return: the reviews filtered by 'offering_id' with the returned items
    """
    kept_items = get_item_list(reviews, min_reviews)
    filtered_reviews =\
        ETLUtils.filter_records(reviews, 'offering_id', kept_items)

    return filtered_reviews
Beispiel #16
0
    def export(self):
        """
        Prepares the top-N evaluation data of this object: optionally filters
        self.records and self.test_records by REVIEW_TYPE, loads the user-item
        map from USER_ITEM_MAP_FILE, and creates and initializes the
        TopNEvaluator, from which self.records_to_predict and
        self.important_records are taken
        """
        # Timestamp in year/month/day order; the format used before
        # ("%Y/%d/%m") swapped the day and the month
        print('export: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if REVIEW_TYPE:
            self.records = ETLUtils.filter_records(
                self.records, constants.PREDICTED_CLASS_FIELD, [REVIEW_TYPE])
            self.test_records = ETLUtils.filter_records(
                self.test_records, constants.PREDICTED_CLASS_FIELD,
                [REVIEW_TYPE])

        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file, so USER_ITEM_MAP_FILE must come from a trusted source
        with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
            user_item_map = pickle.load(read_file)

        self.top_n_evaluator = TopNEvaluator(
            self.records, self.test_records, DATASET, 10, my_i)
        self.top_n_evaluator.initialize(user_item_map)
        self.records_to_predict = self.top_n_evaluator.get_records_to_predict()
        # self.top_n_evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
        self.important_records = self.top_n_evaluator.important_records
Beispiel #17
0
    def get_records_to_predict_rmse(self):
        """
        Uses the test records of this object as the important records and as
        the records to predict. When Constants.TEST_CONTEXT_REVIEWS_ONLY is
        set, they are first passed through ETLUtils.filter_records with
        Constants.HAS_CONTEXT_FIELD and [True]. At the end self.test_records is
        set to None and a garbage collection is requested
        """
        timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('get_records_to_predict_rmse: %s' % timestamp)

        candidates = self.test_records
        self.important_records = candidates

        if Constants.TEST_CONTEXT_REVIEWS_ONLY:
            candidates = ETLUtils.filter_records(
                candidates, Constants.HAS_CONTEXT_FIELD, [True])
            self.important_records = candidates

        self.records_to_predict = candidates
        self.test_records = None
        gc.collect()
Beispiel #18
0
def remove_users_with_low_reviews(reviews, min_reviews):
    """
    Returns the reviews whose user is among the users that
    get_user_list(reviews, min_reviews) selects

    :param reviews: a list of reviews
    :param min_reviews: the minimum number of reviews a user must have in order
    not to be removed from the reviews list (forwarded to get_user_list)
    :return: the result of filtering the reviews by 'user_id' with the
    selected users
    """
    return ETLUtils.filter_records(
        reviews, 'user_id', get_user_list(reviews, min_reviews))
def get_unknown_items(reviews, user_id, num_unknown=1000):
    """
    Lists items that appear in the reviews but not among the reviews obtained
    for the given user

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param num_unknown: the result is cut with the slice [:num_unknown]
    :return: a list with the item IDs, in the order in which
    extractor.get_groupby_list returns them
    """
    all_items = extractor.get_groupby_list(reviews, 'offering_id')
    reviews_of_user = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    rated_items = set(
        extractor.get_groupby_list(reviews_of_user, 'offering_id'))

    # The unknown items are those in all_items that the user has not rated
    unrated_items = []
    for item_id in all_items:
        if item_id not in rated_items:
            unrated_items.append(item_id)

    # TODO: Uncomment this line, the items have to be shuffled
    # shuffle(unrated_items)

    return unrated_items[:num_unknown]
Beispiel #20
0
def remove_users_with_low_reviews(reviews, min_reviews):
    """
    Keeps only the reviews of the users returned by
    get_user_list(reviews, min_reviews)

    :param reviews: a list of reviews
    :param min_reviews: the minimum number of reviews a user must have in order
    not to be removed from the reviews list (handed over to get_user_list)
    :return: the reviews filtered by 'user_id' with the returned users
    """
    kept_users = get_user_list(reviews, min_reviews)
    filtered_reviews = ETLUtils.filter_records(reviews, 'user_id', kept_users)

    return filtered_reviews
Beispiel #21
0
def get_unknown_items(reviews, user_id, num_unknown=1000):
    """
    Collects the items found in the reviews that are missing from the reviews
    obtained for the given user

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param num_unknown: the result is cut with the slice [:num_unknown]
    :return: a list with the item IDs, in the order in which
    extractor.get_groupby_list returns them
    """
    candidate_items = extractor.get_groupby_list(reviews, 'offering_id')
    own_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    own_items = set(extractor.get_groupby_list(own_reviews, 'offering_id'))

    # Keep the candidates that are not among the user's own items
    missing_items = []
    for candidate in candidate_items:
        if candidate in own_items:
            continue
        missing_items.append(candidate)

    # TODO: Uncomment this line, the items have to be shuffled
    # shuffle(missing_items)

    return missing_items[:num_unknown]
Beispiel #22
0
def create_topic_model_with_context_records():
    """
    Loads the classified processed reviews, prints the 'bow' entry of the
    records that remain after filtering by 'context_type' == 'context' and
    'predicted_class' == 'specific', and then, for every number of topics from
    1 up to the number of context records, creates a topic model and writes
    its topics to an Excel file with generate_excel_file
    """
    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for index, specific_record in enumerate(context_specific_records):
        print('%d:\t%s' % (index, specific_record['bow']))

    for num_topics in range(1, len(context_records)+1):

        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        # NOTE(review): the model is created from all the records, not only
        # from the context records -- confirm this is intended
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            topic_terms = context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]

            result = {'topic_id': topic}
            result.update(split_topic(topic_terms))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
        generate_excel_file(topic_data, file_name)
Beispiel #23
0
def remove_items_with_low_reviews(reviews, min_reviews):
    """
    Keeps only the reviews of the items returned by
    get_item_list(reviews, min_reviews) and prints how many items were
    discarded

    :param reviews: a list of reviews
    :param min_reviews: handed over to get_item_list; presumably the minimum
    number of reviews an item must have to be kept (confirm against
    get_item_list)
    :return: the reviews filtered by Constants.ITEM_ID_FIELD with the returned
    items
    """
    kept_items = get_item_list(reviews, min_reviews)
    num_all_items = len(get_groupby_list(reviews, Constants.ITEM_ID_FIELD))

    print('Discarded %d items due to low count'
          % (num_all_items - len(kept_items)))

    return ETLUtils.filter_records(
        reviews, Constants.ITEM_ID_FIELD, kept_items)
Beispiel #24
0
def get_user_item_reviews(records, user_id, apply_filter=False):
    """
    Returns a dictionary that maps the 'offering_id' of every record to the
    'text' of that record. If an item appears more than once, the text of its
    last record is the one kept

    :param records: a list of records with the entries 'offering_id' and 'text'
    :param user_id: the ID of the user
    :param apply_filter: when True the records are first passed through
    ETLUtils.filter_records with the 'user_id' field and user_id
    :return: a dictionary with the review text of each item; an empty
    dictionary if there are no records
    """
    if apply_filter:
        records = ETLUtils.filter_records(records, 'user_id', [user_id])

    reviews_by_item = {}

    # "or ()" makes an empty or missing list of records yield an empty result
    for user_record in records or ():
        text = user_record['text']
        reviews_by_item[user_record['offering_id']] = text

    return reviews_by_item
Beispiel #25
0
    def test_filter_records(self):
        """
        Verifies that filtering reviews_matrix_5_short by the 'offering_id'
        values 1, 3 and 5 produces exactly the expected list of records
        """
        keys = ('user_id', 'offering_id', 'overall_rating')

        # One tuple per expected record, in the same order as the keys above
        expected_values = [
            ('U1', 1, 5.0),
            ('U1', 3, 5.0),
            ('U2', 1, 5.0),
            ('U2', 3, 5.0),
            ('U2', 5, 9.0),
        ]
        expected_result = [dict(zip(keys, values)) for values in expected_values]

        actual_result = ETLUtils.filter_records(
            reviews_matrix_5_short, 'offering_id', [1, 3, 5])

        self.assertEqual(expected_result, actual_result)
Beispiel #26
0
def get_user_item_reviews(records, user_id, apply_filter=False):
    """
    Builds a dictionary whose keys are the 'offering_id' values of the records
    and whose values are the 'text' of those records. When an item shows up
    several times, the text of its last record wins

    :param records: a list of records with the entries 'offering_id' and 'text'
    :param user_id: the ID of the user
    :param apply_filter: if True, the records are first passed through
    ETLUtils.filter_records with the 'user_id' field and user_id
    :return: a dictionary with the review text of each item; an empty
    dictionary if there are no records
    """
    selected_records = records
    if apply_filter:
        selected_records = ETLUtils.filter_records(
            records, 'user_id', [user_id])

    texts_by_item = {}
    if selected_records:
        for selected_record in selected_records:
            review_text = selected_record['text']
            texts_by_item[selected_record['offering_id']] = review_text

    return texts_by_item
Beispiel #27
0
    def remove_foreign_reviews(self):
        """
        Replaces self.records with the result of filtering them by
        Constants.LANGUAGE_FIELD with the value Constants.LANGUAGE, and prints
        how many records (and which percentage of them) were removed
        """
        print('%s: remove foreign reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        initial_length = len(self.records)

        self.records = ETLUtils.filter_records(
            self.records, Constants.LANGUAGE_FIELD, [Constants.LANGUAGE])
        removed_records_count = initial_length - len(self.records)

        # With no records at all there is nothing to divide by; report 0%
        # instead of failing with a ZeroDivisionError
        if initial_length:
            percentage = removed_records_count / float(initial_length) * 100
        else:
            percentage = 0.

        msg = "A total of %d (%f%%) records were removed because their " \
              "language was not '%s'" % (
                removed_records_count, percentage, Constants.LANGUAGE)
        print(msg)
Beispiel #28
0
def get_user_item_contexts(records, lda_model, user_id, apply_filter=False):
    """
    Returns a dictionary that maps the 'offering_id' of every record to the
    result of get_topic_distribution for the 'text' of that record. If an item
    appears more than once, the value of its last record is the one kept

    :param records: a list of records with the entries 'offering_id' and 'text'
    :param lda_model: passed unchanged to get_topic_distribution
    :param user_id: the ID of the user
    :param apply_filter: when True the records are first passed through
    ETLUtils.filter_records with the 'user_id' field and user_id
    :return: a dictionary with the context of each item; an empty dictionary
    if there are no records
    """
    if apply_filter:
        records = ETLUtils.filter_records(records, 'user_id', [user_id])

    contexts_by_item = {}

    # "or ()" makes an empty or missing list of records yield an empty result
    for user_record in records or ():
        contexts_by_item[user_record['offering_id']] =\
            get_topic_distribution(user_record['text'], lda_model)

    return contexts_by_item
Beispiel #29
0
def create_user_item_map(records):
    """
    Builds a dictionary that maps every user ID found in the records to the
    list of item IDs obtained from that user's records. A progress counter is
    written to the standard output while the map is built

    :param records: a list of records
    :return: a dictionary whose keys are user IDs and whose values are the
    lists returned by extractor.get_groupby_list for the records of each user
    """
    # Local import: only needed for the progress output
    import sys

    user_ids = extractor.get_groupby_list(records, Constants.USER_ID_FIELD)
    user_item_map = {}

    for user_count, user_id in enumerate(user_ids, 1):
        user_records =\
            ETLUtils.filter_records(records, Constants.USER_ID_FIELD, [user_id])
        user_item_map[user_id] =\
            extractor.get_groupby_list(user_records, Constants.ITEM_ID_FIELD)

        # The carriage return keeps the counter on a single line.
        # sys.stdout.write is used because the former print statement is a
        # syntax error under Python 3
        sys.stdout.write('user count: {0}\r'.format(user_count))

    sys.stdout.write('\n')

    return user_item_map
Beispiel #30
0
def create_user_item_map(records):
    """
    Builds a dictionary that maps every user ID found in the records to the
    list of item IDs obtained from that user's records. A progress counter is
    written to the standard output while the map is built

    :param records: a list of records
    :return: a dictionary whose keys are user IDs and whose values are the
    lists returned by extractor.get_groupby_list for the records of each user
    """
    # Local import: only needed for the progress output
    import sys

    user_ids = extractor.get_groupby_list(records, constants.USER_ID_FIELD)
    user_item_map = {}

    for user_count, user_id in enumerate(user_ids, 1):
        user_records =\
            ETLUtils.filter_records(records, constants.USER_ID_FIELD, [user_id])
        user_item_map[user_id] =\
            extractor.get_groupby_list(user_records, constants.ITEM_ID_FIELD)

        # The carriage return keeps the counter on a single line.
        # sys.stdout.write is used because the former print statement is a
        # syntax error under Python 3
        sys.stdout.write('user count: {0}\r'.format(user_count))

    sys.stdout.write('\n')

    return user_item_map
Beispiel #31
0
def get_user_item_multi_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    element-wise average of the 'multi_ratings' lists of the reviews of that
    item

    :param reviews: a list of reviews, each one with the item ID field and a
    'multi_ratings' list
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the reviews have to be
    filtered by user_id or not. In other words, this boolean indicates if the
    list contains reviews from several users or not. If it does contain reviews
    from other users, those have to be removed
    :return: a dictionary with the averaged multi-criteria ratings (lists of
    floats) of the items that the given user has rated
    """

    if apply_filter:
        user_reviews = \
            ETLUtils.filter_records(reviews, Constants.USER_ID_FIELD, [user_id])
    else:
        user_reviews = reviews

    # The reviews are grouped with a dictionary. itertools.groupby only merges
    # consecutive records, so with unsorted reviews an item that showed up
    # again later used to overwrite the average of its earlier reviews
    reviews_by_item = {}
    for review in user_reviews:
        reviews_by_item.setdefault(
            review[Constants.ITEM_ID_FIELD], []).append(review)

    user_multi_item_ratings = {}

    for item_id, item_reviews in reviews_by_item.items():
        # float() avoids an integer division under Python 2
        num_reviews = float(len(item_reviews))
        averaged_multi_ratings = [0.] * len(item_reviews[0]['multi_ratings'])

        for review in item_reviews:
            for rating_index, rating in enumerate(review['multi_ratings']):
                averaged_multi_ratings[rating_index] += rating / num_reviews

        user_multi_item_ratings[item_id] = averaged_multi_ratings

    return user_multi_item_ratings
    def prepare(self):
        """
        Writes the train records and the records to predict of this object to
        CSV files and converts those files with libfm_converter.csv_to_libfm.
        Both lists of records are modified in place on the way
        """
        print('prepare: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # The headers are built from the number of sense groups
        self.headers = build_headers(len(self.sense_groups))

        if Constants.USE_CONTEXT is True:
            # Merge the entries stored under Constants.CONTEXT_WORDS_FIELD
            # into each record itself
            for record in self.train_records:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])

            for record in self.records_to_predict:
                record.update(record[Constants.CONTEXT_WORDS_FIELD])

            # Optionally filter the train records by their predicted class;
            # the records to predict are not filtered
            if Constants.FM_REVIEW_TYPE:
                self.train_records = ETLUtils.filter_records(
                    self.train_records, Constants.PREDICTED_CLASS_FIELD,
                    [Constants.FM_REVIEW_TYPE])

            # ETLUtils.drop_fields([Constants.TOPICS_FIELD], self.train_records)

        # presumably removes every field that is not a header -- confirm
        # against ETLUtils.keep_fields
        ETLUtils.keep_fields(self.headers, self.train_records)
        ETLUtils.keep_fields(self.headers, self.records_to_predict)

        ETLUtils.save_csv_file(
            self.csv_train_file, self.train_records, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, self.records_to_predict, self.headers)

        # NOTE(review): only CSV files are written in this method, although
        # the message also mentions JSON files
        print('Exported CSV and JSON files: %s'
              % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]

        print('num_cols', len(self.headers))

        # NOTE(review): the positional arguments 0, [1, 2] and [] presumably
        # select the target column and the categorical/numerical columns --
        # confirm against libfm_converter.csv_to_libfm
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True,
            suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
Beispiel #33
0
def get_criteria_weights(reviews, user_id, apply_filter=True):
    """
    Obtains the weights for each of the criteria of the given user by fitting,
    with least squares, the ratings matrix of the user against the user's
    overall ratings plus a constant term

    :param reviews: a list of all the available reviews
    :param user_id: the ID of the user
    :param apply_filter: when True the reviews are first passed through
    ETLUtils.filter_records with the 'user_id' field and user_id; when False
    they are used as given
    :return: the first row of the least-squares solution, i.e. the coefficients
    that multiply the overall rating (the second row, which multiplies the
    column of ones, is discarded)
    """
    user_reviews = reviews
    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])

    ratings_matrix, overall_ratings_list = create_ratings_matrix(user_reviews)

    # Design matrix with two columns: the overall ratings and a constant 1
    ones_column = numpy.ones(len(overall_ratings_list))
    design_matrix = numpy.vstack([overall_ratings_list, ones_column]).T

    solution = numpy.linalg.lstsq(design_matrix, ratings_matrix)[0]
    slopes, _intercepts = solution

    return slopes
Beispiel #34
0
def get_user_item_multi_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    element-wise average of the 'multi_ratings' lists of the reviews of that
    item

    :param reviews: a list of reviews, each one with an 'offering_id' and a
    'multi_ratings' list
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the reviews have to be
    filtered by user_id or not. In other words, this boolean indicates if the
    list contains reviews from several users or not. If it does contain reviews
    from other users, those have to be removed
    :return: a dictionary with the averaged multi-criteria ratings (lists of
    floats) of the items that the given user has rated
    """

    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    # The reviews are grouped with a dictionary. itertools.groupby only merges
    # consecutive records, so with unsorted reviews an item that showed up
    # again later used to overwrite the average of its earlier reviews
    reviews_by_item = {}
    for review in user_reviews:
        reviews_by_item.setdefault(review['offering_id'], []).append(review)

    user_multi_item_ratings = {}

    for item_id, item_reviews in reviews_by_item.items():
        # float() avoids an integer division under Python 2
        num_reviews = float(len(item_reviews))
        averaged_multi_ratings = [0.] * len(item_reviews[0]['multi_ratings'])

        for review in item_reviews:
            for rating_index, rating in enumerate(review['multi_ratings']):
                averaged_multi_ratings[rating_index] += rating / num_reviews

        user_multi_item_ratings[item_id] = averaged_multi_ratings

    return user_multi_item_ratings
Beispiel #35
0
def get_criteria_weights(reviews, user_id, apply_filter=True):
    """
    Computes the weights for each of the criteria of the given user with a
    least-squares fit of the user's ratings matrix against the user's overall
    ratings plus a constant term

    :param reviews: a list of all the available reviews
    :param user_id: the ID of the user
    :param apply_filter: if True, the reviews are first passed through
    ETLUtils.filter_records with the 'user_id' field and user_id; if False,
    they are used as given
    :return: the first row of the least-squares solution, i.e. the coefficients
    that multiply the overall rating (the second row, which multiplies the
    column of ones, is discarded)
    """
    if apply_filter:
        reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])

    ratings_matrix, overall_ratings_list = create_ratings_matrix(reviews)

    # Two regressors per review: the overall rating and a constant 1
    regressors = numpy.vstack(
        [overall_ratings_list, numpy.ones(len(overall_ratings_list))]).T

    fit = numpy.linalg.lstsq(regressors, ratings_matrix)
    weights, _constants = fit[0]

    return weights
Beispiel #36
0
def get_user_item_contexts(records,
                           lda_model,
                           user_id,
                           apply_filter=False,
                           minimum_probability=None):
    """
    Returns a dictionary where the key is the 'offering_id' of a record and
    the value is the result of calling get_topic_distribution with the 'text'
    of that record. If several records share the same 'offering_id', the value
    of the last one is the one that is kept

    :param records: a list of records, each one with the 'text' and
    'offering_id' fields
    :param lda_model: the model that is handed to get_topic_distribution
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the records have to be
    passed through ETLUtils.filter_records on the 'user_id' field before they
    are used
    :param minimum_probability: a value that is handed unchanged to
    get_topic_distribution
    :return: a dictionary with one entry per 'offering_id', or an empty
    dictionary when there are no records to process
    """
    if apply_filter:
        records = ETLUtils.filter_records(records, 'user_id', [user_id])

    # Nothing to do for an empty (or missing) collection of records
    if not records:
        return {}

    contexts_by_item = {}

    for user_record in records:
        # The right-hand side is evaluated first, so the text is read and the
        # topics are computed before the item ID is looked up
        contexts_by_item[user_record['offering_id']] = get_topic_distribution(
            user_record['text'], lda_model, minimum_probability)

    return contexts_by_item
Beispiel #37
0
# Business IDs that appear in filter calls that are currently disabled. To work
# with one of them, use it in place of the ID in the active call below.
#
# First group:
#   '0UZ31UTcOLRKuqPqPe-VBA', 'aRkYtXfmEKYG-eTDf_qUsw',
#   '-sC66z4SO3tR7nFCjfQwuQ', 'EWMwV5V9BxNs_U6nNVMeqw',
#   'L9UYbtAUOcfTgZFimehlXw', 'uFJwKlHL6HyHSJmORO8-5w',
#   'WS1z1OAR0tRl4FsjdTGUFQ', 'FURgKkRFtMK5yKbjYZVVwA',
#   'Gq092IH6eZqhAXwtXcwc6A', 'R8VwdLyvsp9iybNqRvm94g',
#   'uKSX1n1RoAzGq4bV8GPHVg'
#
# Second group:
#   'hW0Ne_HTHEAgGF1rAdmR-g', 'VVeogjZya58oiTxK7qUjAQ',
#   'JokKtdXU7zXHcr20Lrk29A', 'V1nEpIRmEa1768oj_tuxeQ',
#   'SDwYQ6eSu1htn8vHWv128g', 'WNy1uzcmm_UHmTyR--o5IA',
#   'ntN85eu27C04nwyPa8IHtw', '-sC66z4SO3tR7nFCjfQwuQ',
#   'QnAzW6KMSciUcuJ20oI3Bw', 'uKSX1n1RoAzGq4bV8GPHVg',
#   'YKOvlBNkF4KpUP9q7x862w', 'aRkYtXfmEKYG-eTDf_qUsw',
#   'pwpl-rxwNRQdgqFz_-qMPg', '3oZcTGb_oDHGwZFiP-7kxQ'

# Active filter call: only this business ID is used
business_records = ETLUtils.filter_records(
    my_records, 'business_id', ['EWMwV5V9BxNs_U6nNVMeqw'])

# Keep just the 'text' field of every filtered record
my_tips = [my_record['text'] for my_record in business_records]
# Disabled step: TipPosTagger.process_tips(my_tips[:1000])
Beispiel #38
0
    def prepare_records_for_libfm(self):
        """
        Exports the train records and the records to predict to CSV files and
        converts those files with libfm_converter.csv_to_libfm

        The steps are:

        1. self.headers is built from self.context_rich_topics
        2. When Constants.USE_CONTEXT is True and Constants.REVIEW_TYPE is
        either SPECIFIC or GENERIC, self.train_records is filtered by
        Constants.PREDICTED_CLASS_FIELD
        3. self.train_records is written to self.csv_train_file. Each row has
        the basic_headers fields and, when the context is used, one value per
        context rich topic taken from the record itself
        4. self.records_to_predict is written to self.csv_test_file in the same
        way, but the topic values come from self.context_topics_map, which is
        indexed by the review ID of the record
        5. Both CSV files are handed to libfm_converter.csv_to_libfm

        As a side effect self.train_records, self.context_topics_map and
        self.context_rich_topics are set to None in order to release memory
        """
        timestamp_format = "%Y/%m/%d-%H:%M:%S"

        print('prepare_records_for_libfm: %s' %
              time.strftime(timestamp_format))

        self.headers = build_headers(self.context_rich_topics)

        # The train records are only narrowed down by predicted class when the
        # context is in use and the review type is one of the two classes
        must_filter_by_class = Constants.USE_CONTEXT is True and (
            Constants.REVIEW_TYPE == Constants.SPECIFIC or
            Constants.REVIEW_TYPE == Constants.GENERIC)
        if must_filter_by_class:
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.REVIEW_TYPE])

        with open(self.csv_train_file, 'w') as train_file:
            train_writer = csv.writer(train_file)

            # The first row of the file is the header
            train_writer.writerow(self.headers)

            for train_record in self.train_records:
                row = [train_record[header] for header in basic_headers]

                if Constants.USE_CONTEXT is True:
                    # The topic values are stored inside the record itself
                    row.extend(
                        train_record[Constants.CONTEXT_TOPICS_FIELD][
                            'topic' + str(topic[0])]
                        for topic in self.context_rich_topics)

                train_writer.writerow(row)

        # The train records are not needed after this point
        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as test_file:
            test_writer = csv.writer(test_file)

            # The first row of the file is the header
            test_writer.writerow(self.headers)

            for test_record in self.records_to_predict:
                row = [test_record[header] for header in basic_headers]

                if Constants.USE_CONTEXT is True:
                    for topic in self.context_rich_topics:
                        # The topic values are looked up by review ID
                        review_id = test_record[Constants.REVIEW_ID_FIELD]
                        topic_values = self.context_topics_map[review_id]
                        row.append(topic_values['topic' + str(topic[0])])

                test_writer.writerow(row)

        # self.records_to_predict is left untouched (its reset is disabled)
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s' %
              time.strftime(timestamp_format))

        csv_files = [self.csv_train_file, self.csv_test_file]

        print('num_cols', len(self.headers))

        # The positional arguments are passed on unchanged; their meaning is
        # defined by libfm_converter.csv_to_libfm
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime(timestamp_format))
Beispiel #39
0
# Business IDs that appear in filter calls that are currently disabled. To work
# with one of them, use it in place of the ID in the active call below.
#
# First group:
#   '0UZ31UTcOLRKuqPqPe-VBA', 'aRkYtXfmEKYG-eTDf_qUsw',
#   '-sC66z4SO3tR7nFCjfQwuQ', 'EWMwV5V9BxNs_U6nNVMeqw',
#   'L9UYbtAUOcfTgZFimehlXw', 'uFJwKlHL6HyHSJmORO8-5w',
#   'WS1z1OAR0tRl4FsjdTGUFQ', 'FURgKkRFtMK5yKbjYZVVwA',
#   'Gq092IH6eZqhAXwtXcwc6A', 'R8VwdLyvsp9iybNqRvm94g',
#   'uKSX1n1RoAzGq4bV8GPHVg'
#
# Second group:
#   'hW0Ne_HTHEAgGF1rAdmR-g', 'VVeogjZya58oiTxK7qUjAQ',
#   'JokKtdXU7zXHcr20Lrk29A', 'V1nEpIRmEa1768oj_tuxeQ',
#   'SDwYQ6eSu1htn8vHWv128g', 'WNy1uzcmm_UHmTyR--o5IA',
#   'ntN85eu27C04nwyPa8IHtw', '-sC66z4SO3tR7nFCjfQwuQ',
#   'QnAzW6KMSciUcuJ20oI3Bw', 'uKSX1n1RoAzGq4bV8GPHVg',
#   'YKOvlBNkF4KpUP9q7x862w', 'aRkYtXfmEKYG-eTDf_qUsw',
#   'pwpl-rxwNRQdgqFz_-qMPg', '3oZcTGb_oDHGwZFiP-7kxQ'

# Active filter call: only this business ID is used
business_records = ETLUtils.filter_records(
    my_records,
    'business_id',
    ['EWMwV5V9BxNs_U6nNVMeqw'])

# Keep just the 'text' field of every filtered record
my_tips = [my_record['text'] for my_record in business_records]
# Disabled step: TipPosTagger.process_tips(my_tips[:1000])
    def prepare_records_for_libfm(self):
        """
        Exports the train records and the records to predict to CSV files and
        converts those files with libfm_converter.csv_to_libfm

        The steps are:

        1. self.headers is built from self.context_rich_topics
        2. When Constants.REVIEW_TYPE is either SPECIFIC or GENERIC,
        self.train_records is filtered by Constants.PREDICTED_CLASS_FIELD
        (regardless of the value of Constants.USE_CONTEXT)
        3. self.train_records is written to self.csv_train_file. Each row has
        the basic_headers fields and, when Constants.USE_CONTEXT is True, one
        value per context rich topic taken from the record itself
        4. self.records_to_predict is written to self.csv_test_file in the same
        way, but the topic values come from self.context_topics_map, which is
        indexed by the review ID of the record
        5. Both CSV files are handed to libfm_converter.csv_to_libfm

        As a side effect self.train_records, self.context_topics_map and
        self.context_rich_topics are set to None in order to release memory
        """
        timestamp_format = "%Y/%m/%d-%H:%M:%S"

        print('prepare_records_for_libfm: %s' %
              time.strftime(timestamp_format))

        self.headers = build_headers(self.context_rich_topics)

        # Only the two review classes trigger the filtering of train records
        is_single_class = (
            Constants.REVIEW_TYPE == Constants.SPECIFIC or
            Constants.REVIEW_TYPE == Constants.GENERIC)
        if is_single_class:
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.REVIEW_TYPE])

        with open(self.csv_train_file, 'w') as train_file:
            train_writer = csv.writer(train_file)

            # The first row of the file is the header
            train_writer.writerow(self.headers)

            for train_record in self.train_records:
                row = [train_record[header] for header in basic_headers]

                if Constants.USE_CONTEXT is True:
                    # The topic values are stored inside the record itself
                    row.extend(
                        train_record[Constants.CONTEXT_TOPICS_FIELD][
                            'topic' + str(topic[0])]
                        for topic in self.context_rich_topics)

                train_writer.writerow(row)

        # The train records are not needed after this point
        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as test_file:
            test_writer = csv.writer(test_file)

            # The first row of the file is the header
            test_writer.writerow(self.headers)

            for test_record in self.records_to_predict:
                row = [test_record[header] for header in basic_headers]

                if Constants.USE_CONTEXT is True:
                    for topic in self.context_rich_topics:
                        # The topic values are looked up by review ID
                        review_id = test_record[Constants.REVIEW_ID_FIELD]
                        topic_values = self.context_topics_map[review_id]
                        row.append(topic_values['topic' + str(topic[0])])

                test_writer.writerow(row)

        # self.records_to_predict is left untouched (its reset is disabled)
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s' %
              time.strftime(timestamp_format))

        csv_files = [self.csv_train_file, self.csv_test_file]

        print('num_cols', len(self.headers))

        # The positional arguments are passed on unchanged; their meaning is
        # defined by libfm_converter.csv_to_libfm
        libfm_converter.csv_to_libfm(csv_files,
                                     0, [1, 2], [],
                                     ',',
                                     has_header=True,
                                     suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime(timestamp_format))