コード例 #1
0
ファイル: reviews_preprocessor.py プロジェクト: ptzagk/yelp
    def full_cycle(self):
        """Run the complete preprocessing cycle and print dataset statistics.

        When ``self.use_cache`` is truthy and the file named by
        ``Constants.PROCESSED_RECORDS_FILE`` exists, the records are read
        back from that file. Otherwise the records are loaded and pushed
        through every preprocessing step in order, ending with
        ``export_records()``.

        In both cases the method then calls
        ``count_specific_generic_ratio()``, prints the values returned by
        the dataset analyzer together with the record, user and item
        totals, and finally calls ``separate_recsys_topic_model_records()``
        if ``Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS`` is truthy.
        """
        Constants.print_properties()
        started_at = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('%s: full cycle' % started_at)
        utilities.plant_seeds()

        # De Morgan form of "caching is on and the cache file exists"; the
        # file-system check still only happens when caching is enabled.
        must_process = not self.use_cache or \
            not os.path.exists(Constants.PROCESSED_RECORDS_FILE)

        if must_process:
            self.load_records()

            item_type = Constants.ITEM_TYPE
            if 'yelp' in item_type:
                self.transform_yelp_records()
            elif 'fourcity' in item_type:
                self.transform_fourcity_records()

            # The steps run strictly in the order listed.
            for step in (
                    self.add_integer_ids,
                    self.clean_reviews,
                    self.remove_duplicate_reviews,
                    self.tag_reviews_language,
                    self.remove_foreign_reviews,
                    self.lemmatize_records,
                    self.remove_users_with_low_reviews,
                    self.remove_items_with_low_reviews,
                    self.count_frequencies,
                    self.shuffle_records):
                step()

            print('total_records: %d' % len(self.records))

            # NOTE: load_full_records() is currently disabled; it used to
            # sit between tag_contextual_reviews and build_dictionary.
            for step in (
                    self.classify_reviews,
                    self.build_bag_of_words,
                    self.tag_contextual_reviews,
                    self.build_dictionary,
                    self.build_corpus,
                    self.label_review_targets,
                    self.export_records):
                step()
        else:
            print('Records have already been processed')
            self.records = ETLUtils.load_json_file(
                Constants.PROCESSED_RECORDS_FILE)

        self.count_specific_generic_ratio()
        # NOTE: export_to_triplet() is currently disabled in this cycle.

        analyzer = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % analyzer.calculate_density_approx())
        print('sparsity: %f' % analyzer.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        users = extractor.get_groupby_list(
            self.records, Constants.USER_ID_FIELD)
        items = extractor.get_groupby_list(
            self.records, Constants.ITEM_ID_FIELD)
        print('total users', len(users))
        print('total items', len(items))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
コード例 #2
0
    def preprocess(self):
        """Run every preprocessing step from scratch and print statistics.

        The records are loaded, transformed according to
        ``Constants.ITEM_TYPE``, passed through each preprocessing step in
        order and exported with ``export_records()``. Afterwards the method
        calls ``count_specific_generic_ratio()`` and ``export_to_triplet()``
        and prints the values returned by the dataset analyzer together
        with the record, user and item totals.
        """
        self.load_records()

        item_type = Constants.ITEM_TYPE
        if 'yelp' in item_type:
            self.transform_yelp_records()
        elif 'fourcity' in item_type:
            self.transform_fourcity_records()

        # The steps run strictly in the order listed.
        for step in (
                self.add_integer_ids,
                self.clean_reviews,
                self.remove_duplicate_reviews,
                self.tag_reviews_language,
                self.remove_foreign_reviews,
                self.remove_reviews_from_classifier_training_set,
                self.lemmatize_records,
                self.remove_users_with_low_reviews,
                self.remove_items_with_low_reviews,
                self.count_frequencies,
                self.shuffle_records):
            step()

        print('total_records: %d' % len(self.records))

        # NOTE: load_full_records() is currently disabled; it used to sit
        # between tag_contextual_reviews and build_dictionary.
        for step in (
                self.classify_reviews,
                self.build_bag_of_words,
                self.tag_contextual_reviews,
                self.build_dictionary,
                self.build_corpus,
                self.label_review_targets,
                self.export_records,
                self.count_specific_generic_ratio,
                self.export_to_triplet):
            step()

        analyzer = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % analyzer.calculate_density_approx())
        print('sparsity: %f' % analyzer.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        users = extractor.get_groupby_list(
            self.records, Constants.USER_ID_FIELD)
        items = extractor.get_groupby_list(
            self.records, Constants.ITEM_ID_FIELD)
        print('total users', len(users))
        print('total items', len(items))