def analyze_data(self, json_file_path="test.json"):
    '''Read a JSON review dataset and log summary statistics.

    Logs the row/column shape, the number of unique reviewers and items,
    and the average review count per reviewer and average rating.

    json_file_path -- path of the JSON file to analyze.  Defaults to
                      "test.json" to preserve the previously hard-coded
                      behavior; pass another path to analyze other files.
    '''
    # Log the time and print out a notification to the user
    default_logger.log_time()
    self.log("Starting analysis...")
    self.log("Reading json file")
    data = pd.read_json(json_file_path)

    # Print out the shape of the data object
    self.log("{0} Reviews (Rows), {1} Columns".format(
        data.shape[0], data.shape[1]))

    # Review count per reviewer -- one entry per unique reviewerID
    group_by_reviewer = data.groupby(['reviewerID'])['reviewerID'].count()
    print("# Unique Reviewers: {0}".format(len(group_by_reviewer)))
    print("# Unique Items: {0}".format(len(data['asin'].unique())))
    print("Average number of reviews: {0}".format(
        group_by_reviewer.mean()))
    print("Average review rating: {0}".format(data['overall'].mean()))

    # Users are connected if they rate the same item
    # graph(data) # WARNING THIS IS VERY SLOW
    self.log("Finished analysis")
# Beispiel #2 (scrape artifact: snippet separator; the stray "0" was a vote count)
    def __init__(self, raw_folder_path, cache_folder_path, csv_file_name):
        '''Remember the data folders and derive the pickle-cache paths.

        raw_folder_path   -- folder holding the raw csv input
        cache_folder_path -- folder where pickled caches are written
        csv_file_name     -- base name (no extension) of the target csv
        '''
        default_logger.log_time()
        self.raw_folder_path = raw_folder_path
        self.cache_folder_path = cache_folder_path
        self.target_csv_file = csv_file_name
        self.log("Initialized preprocessor")

        # Cache-file locations; plain concatenation, so the folder path
        # is presumably expected to end with a separator -- TODO confirm.
        cache_prefix = self.cache_folder_path + self.target_csv_file
        self.user_reviews_pkl_file_path = cache_prefix + '.pkl'
        self.product_integer_encoding_file_path = cache_prefix + '_item_integer_encoding.pkl'
        self.user_integer_encoding_file_path = cache_prefix + '_user_integer_encoding.pkl'
    def __init__(self,
                 dataset,
                 model_options=AllOptions.ModelOptions,
                 data_options=AllOptions.DataOptions,
                 num_users=24303,
                 num_items=10672):
        '''Initialize the user-item recommender on top of the NN base.

        dataset       -- review dataset forwarded to NN.__init__
        model_options -- model configuration options
        data_options  -- data/path configuration options
        num_users     -- number of distinct users.  Defaults to the
                         previously hard-coded 24303; for other datasets
                         pass e.g. dataset.get_num_users().
        num_items     -- number of distinct items.  Defaults to the
                         previously hard-coded 10672; for other datasets
                         pass e.g. dataset.get_num_items().
        '''
        default_logger.log_time()

        NN.__init__(self, dataset, model_options, data_options)

        # Previously hard-coded magic numbers; now parameterized with
        # backward-compatible defaults so existing callers are unchanged.
        self.num_users = num_users
        self.num_items = num_items

        # Get the data
        self.log("Initialized UserItemRecommender")
# Beispiel #4 (scrape artifact: snippet separator; the stray "0" was a vote count)
    def train(self):
        '''Train the model.

        Requires build_model() to have been called first; logs an error
        and returns None otherwise.  Optionally attaches checkpoint and
        early-stopping callbacks per the model options, then returns the
        Keras training history.
        '''
        # Guard clause: nothing to train without a built model
        if self.model is None:
            self.log("ERROR: Please create a model using build_model() first.")
            return

        default_logger.log_time()
        self.log("Beginning training...")

        opts = self.model_options
        callbacks = []

        # Optional periodic weight checkpointing
        if opts.checkpoint_enabled:
            callbacks.append(keras.callbacks.ModelCheckpoint(
                self.data_options.checkpoints_folder_path + opts.checkpoint_string,
                monitor=opts.checkpoint_monitor,
                verbose=opts.checkpoint_verbose,
                save_weights_only=True,
                period=opts.checkpoint_period))

        # Optional early stopping on a monitored metric
        if opts.early_stopping_enabled:
            callbacks.append(keras.callbacks.EarlyStopping(
                monitor=opts.early_stopping_monitor,
                min_delta=opts.early_stopping_min_delta,
                patience=opts.early_stopping_patience,
                verbose=opts.early_stopping_verbose))

        history = self.model.fit(
            self.partial_train_data,
            self.partial_train_labels,
            epochs=opts.num_epochs,
            batch_size=opts.training_batch_size,
            validation_data=(self.validation_data, self.validation_labels),
            verbose=1,
            callbacks=callbacks)

        self.log("Finished training at:")
        default_logger.log_time()
        self.log("You can override this training function in a subclass")
        return history
# Beispiel #5 (scrape artifact: snippet separator; the stray "0" was a vote count)
    def __init__(self, dataset, model_options=AllOptions.ModelOptions, data_options=AllOptions.DataOptions):
        '''Base NN setup: store the options, pull the train/test split
        from the dataset, and carve a validation set out of training.
        '''
        default_logger.log_time()

        # Make sure you are running an adequate version of Tensorflow
        self.log("Tensorflow version: {}".format(tf.__version__))

        # Stash configuration; no model has been built yet
        self.review_dataset = dataset
        self.data_options = data_options
        self.model_options = model_options
        self.model = None

        # Fetch the train/test split from the dataset wrapper
        train_split, test_split = self.review_dataset.get_training_and_testing()
        self.train_data, self.train_labels = train_split
        self.test_data, self.test_labels = test_split
        self.log("Retrieved training and testing data from the dataset")

        # Reserve validation samples out of the training data
        self.create_validation_from_training(
            model_options.num_validation_samples)

        self.log("Initialized NN")
    def create_csv(self, target_file_name):
        '''Convert the json file named by 'target_file_name' into a csv.

        First strips unneeded data from the raw json, then converts the
        stripped json to csv, logging timestamps along the way.
        '''
        self.update_paths(target_file_name)

        # Timestamp plus progress notifications
        default_logger.log_time()
        self.log("Reading json file")
        self.log("Stripping unnecessary data and converting to proper json...")

        # Reduce the raw json to the fields we keep
        self.filter_data(self.target_json_path, self.stripped_json_path)

        default_logger.log_time()
        self.log(
            "Finished stripping unnecessary data. Now converting json to csv..."
        )

        # Write the stripped json back out as csv
        self.convert_json_to_csv(self.stripped_json_path, self.target_csv_path)
        self.log("Finished converting json file at:")
        default_logger.log_time()
        # Build and train the model
        user_item_recommender.build_model()
        training_history = user_item_recommender.train()

        # Evaluate the trained model
        user_item_recommender.evaluate_model()

        # Save the model
        saved_file_name = user_item_recommender.save_model(training_history)

        # Graph the results from training
        user_item_recommender.generate_graphs(
            training_history,
            AllOptions.DataOptions.graphs_folder_path + saved_file_name)

    else:
        print("Loading model...")
        # Load model from a checkpoint
        # user_item_recommender.load_model_from_checkpoint('weights_020_0.73loss.hdf5')
        user_item_recommender.load_model(
            '/2018-12-20/13h-44m-53s_user_item_NN_model_[1.851val_loss]_[1.066val_mean_absolute_error]_[0.341loss]_[0.424mean_absolute_error].h5'
        )
        user_item_recommender.evaluate_model()
        default_logger.log_time()

    (user_embeddings, item_embeddings) = user_item_recommender.get_embeddings()
    default_logger.log_time()
    user_item_predictions = user_item_recommender.recommend_items_for_user(546)
    default_logger.log_time()
# Beispiel #8 (scrape artifact: snippet separator; the stray "0" was a vote count)
    def get_user_reviews(self, reconstruct=False):
        '''Get user reviews.

        Returns (user_reviews, user_encoding_df, product_encoding_df).

        If a cached pickle exists (and reconstruct is False) everything is
        loaded from disk.  Otherwise the raw csv is read, the 'asin' and
        'reviewerID' columns are integer-encoded with LabelEncoder (the
        originals kept in 'original_asin' / 'original_reviewerID'), and
        the dataframe plus both encodings are pickled for next time.

        reconstruct -- force a rebuild from the raw csv even when a cache
                       exists (default False)
        '''
        user_reviews = None

        if not reconstruct and os.path.exists(
                self.user_reviews_pkl_file_path):
            with open(self.user_reviews_pkl_file_path,
                      'rb') as user_reviews_file:
                self.log("Loaded stored user reviews data at: {}".format(
                    self.user_reviews_pkl_file_path))
                # NOTE: pickle.load on an untrusted file can execute
                # arbitrary code -- only load caches written by this class.
                user_reviews = pickle.load(user_reviews_file)
                # BUGFIX: these two assignments were swapped -- the item
                # encoding was returned as the user encoding and vice versa.
                product_encoding_df = self.get_item_encoding(
                    reconstruct=reconstruct)
                user_encoding_df = self.get_user_encoding(
                    reconstruct=reconstruct)
        else:
            # Cache miss (or forced rebuild): read the raw csv and encode.
            # Log the time of this function call (reset checkpoint too)
            default_logger.set_checkpoint()
            default_logger.log_time()
            user_reviews = pd.read_csv(self.raw_folder_path +
                                       self.target_csv_file + '.csv')

            # Get products
            product_ids = array(user_reviews.asin)
            self.log("First 10 products IDs: {}".format(product_ids[:10]))

            # Integer encode products
            product_label_encoder = LabelEncoder()
            integer_encoded_products = product_label_encoder.fit_transform(
                product_ids)
            self.log("First 10 ENCODED products IDs: {}".format(
                integer_encoded_products[:10]))

            # Keep the original ids, replace 'asin' with the integer codes
            user_reviews['original_asin'] = user_reviews['asin']
            user_reviews['asin'] = integer_encoded_products.astype(int)

            # Save the encoding for products (row index -> original asin)
            self.log("Saving product integer encoding...")
            product_encoding_df = pd.DataFrame(
                data={'original_asin': product_label_encoder.classes_})
            with open(self.product_integer_encoding_file_path,
                      'wb') as product_encoding_file:
                pickle.dump(product_encoding_df, product_encoding_file,
                            pickle.HIGHEST_PROTOCOL)

            # Visual divider in the log output
            self.log("_" * 100)

            # Get reviewers
            user_ids = array(user_reviews.reviewerID)
            self.log("First 10 reviewer IDs: {}".format(user_ids[:10]))

            # Integer encode reviewers
            reviewer_label_encoder = LabelEncoder()
            integer_encoded_reviewers = reviewer_label_encoder.fit_transform(
                user_ids)
            self.log("First 10 ENCODED user IDs: {}".format(
                integer_encoded_reviewers[:10]))

            # Keep the original ids, replace 'reviewerID' with the codes
            user_reviews['original_reviewerID'] = user_reviews['reviewerID']
            user_reviews['reviewerID'] = integer_encoded_reviewers.astype(int)

            # Save the encoding for users (row index -> original reviewerID)
            self.log("Saving user integer encoding...")
            user_encoding_df = pd.DataFrame(
                data={'original_reviewerID': reviewer_label_encoder.classes_})
            with open(self.user_integer_encoding_file_path,
                      'wb') as user_encoding_file:
                pickle.dump(user_encoding_df, user_encoding_file,
                            pickle.HIGHEST_PROTOCOL)

            # Save the user reviews dataframe to a file
            self.log("Saving preprocessed user reviews data...")
            with open(self.user_reviews_pkl_file_path,
                      'wb') as user_reviews_file:
                pickle.dump(user_reviews, user_reviews_file,
                            pickle.HIGHEST_PROTOCOL)
            self.log("Saved user reviews data at: {}".format(
                self.user_reviews_pkl_file_path))

            # Log the time this function call took
            default_logger.log_time()

        return user_reviews, user_encoding_df, product_encoding_df