Example #1
import operator

import pandas

# Project imports; the ETLUtils and ReviewsDatasetAnalyzer paths follow the
# notebook code in Example #6. The import path for Constants is not shown
# in the source.
from etl import ETLUtils
from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer


def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    # Count how many records contain each value of the field.
    user_frequency_map = {}

    for record in records:
        field_value = record[field]
        if field_value not in user_frequency_map:
            user_frequency_map[field_value] = 0
        user_frequency_map[field_value] += 1

    print('There are a total of %d %ss' % (len(user_frequency_map), field))

    # Print the three most frequent values of the field.
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per value of the field
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    # Divide by the number of distinct field values so the average is correct
    # for any field, not only user_id.
    print('Average number of reviews per %s: %f' % (
        field, float(len(records)) / len(user_frequency_map)))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
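
The function takes the grouping field as a parameter, so the same routine serves both user and item buckets. A minimal driver sketch (hypothetical; the field-name constants mirror the Constants.USER_ID_FIELD and Constants.ITEM_ID_FIELD references in Examples #2 and #3):

if __name__ == '__main__':
    # Hypothetical invocation; the field names are assumptions borrowed from
    # the other examples on this page.
    dataset_bucket_analysis_by_field(Constants.USER_ID_FIELD)
    dataset_bucket_analysis_by_field(Constants.ITEM_ID_FIELD)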
Example #2
    def full_cycle(self):
        """Run the full preprocessing pipeline, reusing the cached processed
        records when self.use_cache is set and the file already exists."""
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.load_records()

            if 'yelp' in Constants.ITEM_TYPE:
                self.transform_yelp_records()
            elif 'fourcity' in Constants.ITEM_TYPE:
                self.transform_fourcity_records()

            self.add_integer_ids()
            self.clean_reviews()
            self.remove_duplicate_reviews()
            self.tag_reviews_language()
            self.remove_foreign_reviews()
            self.lemmatize_records()
            self.remove_users_with_low_reviews()
            self.remove_items_with_low_reviews()
            self.count_frequencies()
            self.shuffle_records()
            print('total_records: %d' % len(self.records))
            self.classify_reviews()
            self.build_bag_of_words()
            self.tag_contextual_reviews()
            # self.load_full_records()
            self.build_dictionary()
            self.build_corpus()
            self.label_review_targets()
            self.export_records()

        self.count_specific_generic_ratio()
        # self.export_to_triplet()

        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(user_ids))
        print('total items: %d' % len(item_ids))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
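
The branch at the top of full_cycle is a plain load-or-rebuild cache pattern. A minimal standalone sketch of the same idea, with hypothetical names and plain json standing in for ETLUtils:

import json
import os


def load_or_build(cache_path, build_records):
    # Reuse the processed file when it exists; otherwise rebuild and cache it.
    if os.path.exists(cache_path):
        with open(cache_path) as cache_file:
            return json.load(cache_file)
    records = build_records()
    with open(cache_path, 'w') as cache_file:
        json.dump(records, cache_file)
    return records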
Example #3
    def preprocess(self):
        """Run the preprocessing pipeline end to end, without the records
        cache used by full_cycle."""

        self.load_records()

        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.remove_reviews_from_classifier_training_set()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()

        self.count_specific_generic_ratio()
        self.export_to_triplet()

        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(user_ids))
        print('total items: %d' % len(item_ids))
Example #4
    def test_count_items_in_common(self):

        expected_value = {1: 23, 2: 4, 3: 1}
        rda = ReviewsDatasetAnalyzer(reviews)
        actual_value = rda.count_items_in_common()
        self.assertEqual(expected_value, actual_value)

        expected_value = {0: 1}
        rda = ReviewsDatasetAnalyzer(reviews_small)
        actual_value = rda.count_items_in_common()
        self.assertEqual(expected_value, actual_value)
Example #5
    def test_calculate_sparsity_approx(self):
        # The first three fixtures call the exact calculate_sparsity; the
        # last assertion exercises the approximate method named by the test.

        expected_value = 0.
        rda = ReviewsDatasetAnalyzer(reviews_matrix_3)
        actual_value = rda.calculate_sparsity()
        self.assertEqual(expected_value, actual_value)

        expected_value = 1 - 6./9
        rda = ReviewsDatasetAnalyzer(reviews_matrix_3_1)
        actual_value = rda.calculate_sparsity()
        self.assertEqual(expected_value, actual_value)

        expected_value = 1 - 3./9
        rda = ReviewsDatasetAnalyzer(reviews_matrix_3_2)
        actual_value = rda.calculate_sparsity()
        self.assertEqual(expected_value, actual_value)

        expected_value = 1 - 15./24
        rda = ReviewsDatasetAnalyzer(reviews)
        actual_value = rda.calculate_sparsity_approx()
        self.assertEqual(expected_value, actual_value)
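
The expected values above encode the usual definition of rating-matrix sparsity: one minus the fraction of (user, item) cells that hold a review, e.g. 1 - 6./9 for a 3x3 matrix with six cells filled. A minimal sketch of that computation (the field names are assumptions taken from the other examples; the real ReviewsDatasetAnalyzer API may differ):

def sparsity(records, user_field='user_id', item_field='offering_id'):
    # sparsity = 1 - observed (user, item) pairs / (num_users * num_items)
    users = set(record[user_field] for record in records)
    items = set(record[item_field] for record in records)
    pairs = set((record[user_field], record[item_field]) for record in records)
    return 1.0 - float(len(pairs)) / (len(users) * len(items))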
Example #6
    def generate_report(reviews, dataset_name, file_name, load_reviews_code):
        # Takes no `self`, so presumably a staticmethod on the enclosing
        # class. `nbf` is the legacy IPython notebook-format API
        # (e.g. `from IPython.nbformat import current as nbf`).
        nb = nbf.new_notebook()
        title = '# ' + dataset_name + ' Dataset Analysis'
        title_cell = nbf.new_text_cell(u'markdown', title)

        rda = ReviewsDatasetAnalyzer(reviews)
        num_reviews = len(rda.reviews)
        num_users = len(rda.user_ids)
        num_items = len(rda.item_ids)
        user_avg_reviews = float(num_reviews) / num_users
        item_avg_reviews = float(num_reviews) / num_items
        sparsity = rda.calculate_sparsity_approx()

        fact_sheet_text =\
            '## Fact Sheet\n' +\
            'The ' + dataset_name + ' contains:\n' +\
            '* ' + str(num_reviews) + ' reviews\n' +\
            '* Made by ' + str(num_users) + ' users\n' +\
            '* About ' + str(num_items) + ' items\n' +\
            '* It has an approximated sparsity of ' + str(sparsity) + '\n' +\
            '\nNow we are going to analyze the number of reviews per user and ' \
            'per item'
        fact_sheet_cell = nbf.new_text_cell(u'markdown', fact_sheet_text)

        reviews_analysis_code =\
            'import sys\n' +\
            'sys.path.append(\'/Users/fpena/UCC/Thesis/projects/yelp/source/python\')\n' +\
            'from etl import ETLUtils\n\n' +\
            'from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer\n' +\
            '\n# Load reviews\n' + load_reviews_code + '\n' +\
            'rda = ReviewsDatasetAnalyzer(reviews)\n'
        reviews_analysis_cell = nbf.new_code_cell(reviews_analysis_code)

        user_analysis_text =\
            '## Users Reviews Analysis\n' +\
            '* The average number of reviews per user is ' + str(user_avg_reviews) + '\n' +\
            '* The minimum number of reviews a user has is ' + str(min(rda.users_count)) + '\n' +\
            '* The maximum number of reviews a user has is ' + str(max(rda.users_count))
        user_analysis_cell = nbf.new_text_cell(u'markdown', user_analysis_text)

        counts_per_user_code =\
            '# Number of reviews per user\n' +\
            'users_summary = rda.summarize_reviews_by_field(\'user_id\')\n' +\
            'print(\'Average number of reviews per user\', float(rda.num_reviews)/rda.num_users)\n' +\
            'users_summary.plot(kind=\'line\', rot=0)'
        counts_per_user_cell = nbf.new_code_cell(counts_per_user_code)

        item_analysis_text =\
            '## Items Reviews Analysis\n' +\
            '* The average number of reviews per item is ' + str(item_avg_reviews) + '\n' +\
            '* The minimum number of reviews an item has is ' + str(min(rda.items_count)) + '\n' +\
            '* The maximum number of reviews an item has is ' + str(max(rda.items_count))
        item_analysis_cell = nbf.new_text_cell(u'markdown', item_analysis_text)

        counts_per_item_code =\
            '# Number of reviews per item\n' +\
            'items_summary = rda.summarize_reviews_by_field(\'offering_id\')\n' +\
            'print(\'Average number of reviews per item\', float(rda.num_reviews)/rda.num_items)\n' +\
            'items_summary.plot(kind=\'line\', rot=0)'
        counts_per_item_cell = nbf.new_code_cell(counts_per_item_code)

        common_items_text =\
            '## Number of items 2 users have in common\n' +\
            'In this section we are going to count the number of items two ' \
            'users have in common'
        common_items_text_cell = nbf.new_text_cell(u'markdown', common_items_text)

        common_items_code =\
            '# Number of items 2 users have in common\n' +\
            'common_item_counts = rda.count_items_in_common()\n' +\
            'plt.plot(common_item_counts.keys(), common_item_counts.values())\n'
        common_items_code_cell = nbf.new_code_cell(common_items_code)

        common_items_box_code =\
            'from pylab import boxplot\n' +\
            'my_data = [key for key, value in common_item_counts.iteritems() for i in xrange(value)]\n' +\
            'mean_common_items = float(sum(my_data))/len(my_data)\n' +\
            'print(\'Average number of common items between two users:\', mean_common_items)\n' +\
            'boxplot(my_data)'
        common_items_box_cell = nbf.new_code_cell(common_items_box_code)

        cells = [
            title_cell,
            fact_sheet_cell,
            reviews_analysis_cell,
            user_analysis_cell,
            counts_per_user_cell,
            item_analysis_cell,
            counts_per_item_cell,
            common_items_text_cell,
            common_items_code_cell,
            common_items_box_cell,
        ]
        nb['worksheets'].append(nbf.new_worksheet(cells=cells))

        with open(file_name, 'w') as f:
            nbf.write(nb, f, 'ipynb')
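
How generate_report is invoked is not shown on this page. A hypothetical call, where load_reviews_code is the Python source the generated notebook runs to define reviews, and the dataset name and paths are placeholders:

load_reviews_code = (
    "file_path = '/tmp/fourcity_hotel_reviews.json'  # hypothetical path\n"
    "reviews = ETLUtils.load_json_file(file_path)"
)
generate_report(reviews, 'Fourcity Hotel', '/tmp/fourcity_report.ipynb',
                load_reviews_code)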