Exemple #1
0
    def predictVideoCategory(self, tree, model_count_title,
                             model_count_description, count_vectorizer_title,
                             count_vectorizer_description):
        """
            Predict the category of the video based on 'title' and 'description' using the model

            Input : tree - final classifier, fed the two intermediate predictions
                    model_count_title, model_count_description - classifiers applied
                        to the vectorized title / description
                    count_vectorizer_title, count_vectorizer_description - vectorizers
                        used to transform self.title / self.description
            Output : first element of the prediction returned by `tree`
            Raises : VideoAnalysisException if any step of the prediction fails
        """
        try:
            # Transform the text features into sparse matrix
            vect_title = count_vectorizer_title.transform([self.title])
            vect_description = count_vectorizer_description.transform(
                [self.description])

            # Perform prediction for 'title' and 'description'
            title_based_prediction = model_count_title.predict(vect_title)
            description_based_prediction = model_count_description.predict(
                vect_description)

            # Merge 'title' and 'description' predictions and perform final prediction on these.
            # The column order is pinned explicitly: a DataFrame built from a plain dict
            # may end up with sorted columns on older Python / pandas versions, which
            # would hand the two features to the tree in swapped order.
            # NOTE(review): this order matches the training frame built in
            # generatePredictingModel, presumably where `tree` comes from - confirm.
            video_df = pd.DataFrame(
                {
                    'title_prediction': title_based_prediction,
                    'description_prediction': description_based_prediction,
                },
                columns=['title_prediction', 'description_prediction'])
            category_prediction = tree.predict(video_df)

            return category_prediction[0]
        except Exception:
            # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
            # are not disguised as an analysis error.
            raise VideoAnalysisException(
                " Error while predicting video catagory")
Exemple #2
0
def plotFeatureImportance(data,clf):
    '''
        Plot barchart showing numerical feature importance for predicting video categories
        Input : data (not used by this function), classifier exposing feature_importances_
        Output : True, after the bar chart has been saved to
                 ../YoutubeData/feature_importance.pdf and shown
        Raises : VideoAnalysisException if the chart cannot be generated
    '''
    try:
        print("Generating the Feature Importance bar chart...\n")
        time.sleep(3)

        # Tick labels for the bars.
        # NOTE(review): assumes clf was trained on exactly these six features,
        # in this order - confirm against the caller.
        predictor_var = ["viewCount", "likeCount", "dislikeCount", "favoriteCount","commentCount", "caption"]
        importances = clf.feature_importances_
        positions = np.arange(len(importances))
        width = 0.7

        # Plot
        _, ax = plt.subplots(figsize=(15, 15))
        ax.bar(positions, importances, width, color='b')
        ax.set_xticks(positions)
        ax.set_xticklabels(predictor_var,rotation=45)
        plt.title('Numerical Features Importance', fontsize=20)
        ax.set_ylabel('Normalized Entropy Importance')

        # Save before showing, so the file is written while the figure still exists
        name = "../YoutubeData/feature_importance.pdf"
        plt.savefig(name)
        print("\nPlease close the Bar Chart when you want to move ahead...")
        plt.show()

        print("You can always retrieve the Feature Importance bar chart in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' rather than a bare except: a Ctrl-C during the pauses or
        # while the chart is displayed must not be reported as a plotting error.
        raise VideoAnalysisException(" Error while ploating Feature Importance Graph ")
Exemple #3
0
def plotGraphicalCorrelationMatrix(data):
    '''
        Draw, save and show a heatmap of the pairwise correlations of the data
        Input : data (object exposing .corr(); presumably a pandas DataFrame)
        Output : True, after the heatmap has been saved to
                 ../YoutubeData/correlation_matrix.pdf and shown
        Raises : VideoAnalysisException if the heatmap cannot be generated
        Inspired from : https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
    '''
    try:
        print("\nGenerating the graphical correlation matrix...\n")
        time.sleep(3)

        corr = data.corr()
        _, ax = plt.subplots(figsize=(20, 20))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the full heatmap with square cells; the automatic tick labels are
        # switched off here because they are set by hand below
        sns.heatmap(corr, cmap=cmap,
                    square=True, xticklabels=False, yticklabels=False,
                    linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
        plt.title('Correlation Matrix', fontsize=30)
        ax.set_ylabel('Features', fontsize=20)
        ax.set_xlabel('Features', fontsize=20)

        # NOTE(review): the labels are hard-coded and assume the correlation matrix
        # has exactly these ten columns in this order; the y labels are the same
        # list reversed - confirm this matches the row order drawn by the seaborn
        # version in use.
        xticklabels = ['video_category_id','viewCount','likeCount','dislikeCount','favoriteCount','commentCount','dimension','definition','caption','licensedContent']
        ylabel = xticklabels[::-1]
        ax.set_xticklabels(xticklabels, rotation=45)
        ax.set_yticklabels(ylabel, rotation=0)

        # Save before showing, so the file is written while the figure still exists
        name = "../YoutubeData/correlation_matrix.pdf"
        plt.savefig(name)
        print("\nPlease close the Bar Chart when you want to move ahead...")
        plt.show()

        print("You can always retrieve the graphical correlation matrix in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' rather than a bare except: a Ctrl-C during the pauses or
        # while the chart is displayed must not be reported as a plotting error.
        raise VideoAnalysisException(" Error while Generating the graphical correlation matrix")
Exemple #4
0
    def visualizeData(self, data):
        """
            Display general information on the data and show the 5 first rows

            Input : data (object exposing .columns and .head(); presumably a pandas DataFrame)
            Output : True once everything has been printed
            Raises : VideoAnalysisException if the presentation fails

            The messages are separated by pauses (time.sleep) so that they can be
            read as they scroll by.
        """
        try:
            print("\n The data we are using are metadata on YouTube videos...")
            time.sleep(3)
            print("\n The metadata we have on the videos are the following: ")
            time.sleep(3)
            # One line per column of the data set
            for col in data.columns:
                print("\n > " + col)
                time.sleep(1.5)
            print("\n We use these data to build a model able to perform prediction on the category to which each video belong...")
            time.sleep(3)
            print("\n The possible categories to which a video can belong are the following (only one category per video): ")
            time.sleep(3)
            print("\n > Autos & Vehicles, Comedy, Education, Entertainment, Film & Animation, Gaming, Howto & Style, Music, "
                  "News & Politics, Nonprofits & Activism, People & Blogs, Pets & Animals, Science & Technology, Sports, Travel & Events ")
            time.sleep(6)
            print("\n In order to do that, we have a data set of about 240.000 YouTube videos...")
            time.sleep(3)
            print("\n Now here is what the first five rows of our data set look like --> ...")
            time.sleep(3)
            print("\n (You can always go check the full file : YouTubeData/train_sample.csv ...)")
            time.sleep(3)
            print(data.head())
            time.sleep(6)
            return True
        except Exception:
            # 'Exception' rather than a bare except: a Ctrl-C during one of the
            # many pauses must not be reported as a visualization error.
            raise VideoAnalysisException(
                " Error while Performing Data Visualization ")
Exemple #5
0
    def binaryTree(self, data):
        '''
            Building a first exploratory model on non-text data to allow future basic data analytics such as feature importance plottting
            input : dataframe including the data (must contain the predictor columns
                    listed below and the 'video_category_id' target column)
            output : decision tree classifier fitted on the training part of the data
            raises : VideoAnalysisException if the model cannot be built
        '''

        try:
            # Split Data Set
            target_var = "video_category_id"
            predictor_var = [
                "viewCount", "likeCount", "dislikeCount", "favoriteCount",
                "commentCount", "caption"
            ]
            # Only the training part (80%) is used here; the held-out part is
            # discarded. The fixed random_state keeps the split reproducible.
            Xtrain, _, Ytrain, _ = train_test_split(data[predictor_var],
                                                    data[target_var],
                                                    train_size=0.80,
                                                    random_state=10)

            # Build model
            decision_tree = DecisionTreeClassifier(criterion="entropy",
                                                   min_samples_split=2,
                                                   min_samples_leaf=128)
            decision_tree.fit(Xtrain, Ytrain)

            return decision_tree

        except Exception:
            # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
            # are not disguised as a model-building error.
            raise VideoAnalysisException(
                " Error while building the decision tree")
Exemple #6
0
 def printVideoCategories(self):
     """
         Print every entry of self.Catagory_mapping, one per line, as 'key --> value'

         Raises : VideoAnalysisException if the mapping cannot be printed
     """
     try:
         for key in self.Catagory_mapping:
             # Single-argument print call: same output as the former
             # 'print key, "-->", value' statement, and valid on Python 3 as well.
             print("%s --> %s" % (key, self.Catagory_mapping[key]))
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as a printing error.
         raise VideoAnalysisException(" Error while printing video catagories ")
Exemple #7
0
 def __init__(self, title, description):
     """
         Video Constructor: store the video's title and description on the instance.
     """
     try:
         self.title = title
         self.description = description
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as an initialization error.
         raise VideoAnalysisException(
             " Error while initializing Video class instance ")
Exemple #8
0
 def __init__(self):
     """
         Initialize the program prompting the instruction to the program

         Prints the program information first, then creates the DataManager and
         DataExplorer instances stored on self.datamanager / self.dataexplorer.
         Raises : VideoAnalysisException if any of these steps fails
     """
     try:
         self.printProgramInfos()
         self.datamanager = DataManager()
         self.dataexplorer = DataExplorer()
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as an initialization error.
         raise VideoAnalysisException(" Error while initializing FlowManager Instance ")
Exemple #9
0
 def printFeatures(self):
     """
         Print every entry of self.features, one per line, as 'key --> value'

         Raises : VideoAnalysisException if the features cannot be printed
     """
     try:
         for key in self.features:
             # Single-argument print call: same output as the former
             # 'print key, "-->", value' statement, and valid on Python 3 as well.
             print("%s --> %s" % (key, self.features[key]))
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as a printing error.
         raise VideoAnalysisException(" Error while printing different features of the data ")
Exemple #10
0
 def load_data(self, path='../YouTubeData/train_sample.csv'):
     '''
         Loading data into pandas dataframe

         Input : path of the CSV file to read (defaults to the training sample)
         Output : dataframe holding the rows of the file in a random order
                  (each row keeps its original index label)
         Raises : VideoAnalysisException if the file cannot be read
     '''
     try:
         dataframe = pd.read_csv(path)
         # Shuffle the rows by re-indexing on a random permutation of the index
         shuffled_index = np.random.permutation(dataframe.index)
         return dataframe.reindex(shuffled_index)
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as a loading error.
         raise VideoAnalysisException(" Error while loading the data ")
Exemple #11
0
 def generalAnalysis(self, data, clf):
     """
         Perform general analysis

         Runs, in order, the feature importance plot, the numerical correlation
         matrix and the graphical correlation matrix of the dataplotter module.
         Input : data, classifier (the classifier is only used by the feature importance plot)
         Output : True once the three steps have completed
         Raises : VideoAnalysisException if any of the steps fails
     """
     try:
         dataplotter.plotFeatureImportance(data, clf)
         dataplotter.plotNumericalCorrelationMatrix(data)
         dataplotter.plotGraphicalCorrelationMatrix(data)
         return True
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as an analysis error.
         raise VideoAnalysisException(" Error while performing general analysis ")
Exemple #12
0
def featuresBarPlot(barNames, barValues):
    """
        Display a bar plot with the barValues as the length of the bars and barNames as the name of the bars.
        Input: name of the bars, value of the bars
        Output: bar plot (shown with plt.show(); nothing is returned)
        Raises: VideoAnalysisException if the plot cannot be drawn
    """
    try:
        # One x position per bar, shared by the bars and their tick labels
        positions = range(len(barNames))
        plt.bar(positions, barValues)
        plt.xticks(positions, barNames, rotation='vertical')
        plt.show()
    except Exception:
        # 'Exception' rather than a bare except: a Ctrl-C while the chart is
        # displayed must not be reported as a plotting error.
        raise VideoAnalysisException(
            " Error while ploating features Bar Plot ")
Exemple #13
0
def plotNumericalCorrelationMatrix(data):
    '''
        Print the correlation matrix of the data
        Input : data (object exposing .corr(); presumably a pandas DataFrame)
        Output : True once the numerical correlation matrix has been printed
        Raises : VideoAnalysisException if the matrix cannot be computed or printed
    '''
    try:
        print("Displaying numerical correlation matrix...\n")
        time.sleep(3)
        print(data.corr())
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' rather than a bare except: a Ctrl-C during the pauses
        # must not be reported as a display error.
        raise VideoAnalysisException(" Error while Displaying numerical correlation matrix ")
Exemple #14
0
 def individual_feature_analysis(self, data, chosenFeature):
     """
         Compute the mean of the chosen feature per video category, then draw,
         save and show the result as a bar chart

         Input : data (must contain 'video_category_id' and the chosen feature column),
                 chosenFeature (key into self.features)
         Output : True once the chart has been saved to
                  ../YoutubeData/FeatureBarChart.pdf and shown
         Raises : VideoAnalysisException if the analysis fails
     """
     try:
         featureName = self.features[chosenFeature]
         print("chosen feature:  %s" % (featureName,))
         featuresMeans = data.groupby(['video_category_id'])[featureName].mean()
         featuresNames = [self.Catagory_mapping[x] for x in featuresMeans.index]

         # The chart is drawn here, before saving: the previous version called
         # plt.savefig() before anything had been plotted, so the saved file did
         # not contain the bar chart. The bars are not drawn through
         # dataplotter.featuresBarPlot because the figure has to be saved between
         # drawing and showing (the featuresBarPlot visible in this file calls
         # plt.show() itself).
         positions = range(len(featuresNames))
         plt.bar(positions, featuresMeans.values)
         plt.xticks(positions, featuresNames, rotation='vertical')
         name = "../YoutubeData/FeatureBarChart.pdf"
         plt.savefig(name)

         print("\nPlease close the Bar Chart when you want to move ahead...")
         plt.show()
         print("You can always retrieve the Feature Importance bar chart in YoutubeData folder.\n")
         time.sleep(3)
         return True
     except Exception:
         # 'Exception' rather than a bare except: a Ctrl-C while the chart is
         # displayed must not be reported as an analysis error.
         raise VideoAnalysisException(" Error while performing individual feature analysis ")
Exemple #15
0
 def clean_data(self, data):
     '''
         convert catagorical data into numeric data and fill in missing descriptions
         dimension : 2d -> 1 , anything else (e.g. 3d) -> 0
         definition :: hd -> 1 , anything else (e.g. sd) -> 0
         caption : True -> 1 , False -> 0
         licensedContent : True -> 1 , False -> 0
         description : missing values replaced by 'No description'

         Input : dataframe holding the five columns above
         Output : the same dataframe object, modified in place
         Raises : VideoAnalysisException if the cleaning fails
     '''
     try:
         # Multiplying a boolean column by 1 turns True/False into 1/0
         data['dimension'] = (data['dimension'] == '2d') * 1
         data['definition'] = (data['definition'] == 'hd') * 1
         data['caption'] = (data['caption']) * 1
         data['licensedContent'] = (data['licensedContent']) * 1
         # Replace the NaN values in 'description'.
         # Assigned back explicitly: an in-place fillna on data['description'] acts on
         # an intermediate object and is not guaranteed to update `data` itself.
         data['description'] = data['description'].fillna('No description')
         return data
     except Exception:
         # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
         # are not disguised as a cleaning error.
         raise VideoAnalysisException(" Error while cleaning the data ")
Exemple #16
0
    def individual_videocatagory_analysis(self, data, video_id):
        """
        Input : cleaned data set, video id
        Output : True once the analysis has been printed
        Raises : VideoAnalysisException if the analysis fails (for instance when
                 video_id is not a key of self.Catagory_mapping)
        This function takes the Video_catagory_id from the user, and analyze the data for that Video Catagory:
        summary statistics, correlation between the count features, and the number
        of videos for each value of the four binary features.
        """

        try:
            video_id = int(video_id)
            VideoCatagory = self.Catagory_mapping[video_id]
            count_features = [
                "viewCount", "likeCount", "dislikeCount", "favoriteCount",
                "commentCount"
            ]
            time.sleep(2)
            print("\n===============")
            # The doubled spaces in the messages below reproduce the spacing of the
            # former multi-item print statements.
            print("Analysis for video catagory =  %s" % (VideoCatagory,))
            print("===============\n")

            # Keep only the rows of the requested category
            VideoCatagoryData = data[data["video_category_id"] == video_id]
            VideoCatagory_CountData = VideoCatagoryData[count_features]
            description = VideoCatagoryData.describe()
            correlation = VideoCatagory_CountData.corr(method='pearson',
                                                       min_periods=1)
            time.sleep(2)
            print("\ndescription of each feature - \n")
            time.sleep(3)
            print(description)
            time.sleep(3)
            print("\ncorrelation within count features - \n")
            time.sleep(3)
            print(correlation)
            time.sleep(3)

            print("\nAnalysis based on count for Binary features :------> \n")
            time.sleep(3)

            # For every binary feature the counts are read with .get(value, 0):
            # when a category has no video for one of the two values, that value is
            # absent from the group-by result and is reported as 0 instead of
            # making the whole analysis fail.
            print("\nDimension analysis: 2d(1) , 3d(0)\n")
            time.sleep(2)
            dimensionCount = VideoCatagoryData.groupby(
                'dimension')['video_category_id'].count()
            print(dimensionCount)
            time.sleep(2)
            print("\nWe have  %s  3D videos and  %s  2D videos for catagory  %s" %
                  (dimensionCount.get(0, 0), dimensionCount.get(1, 0),
                   VideoCatagory))
            time.sleep(4)

            print("\ndefinition analysis: hd(1) , sd(0)\n")
            time.sleep(2)
            DefinitionCount = VideoCatagoryData.groupby(
                'definition')['video_category_id'].count()
            print(DefinitionCount)
            time.sleep(2)
            print("\nWe have  %s  SD videos and  %s  HD videos for catagory  %s" %
                  (DefinitionCount.get(0, 0), DefinitionCount.get(1, 0),
                   VideoCatagory))
            time.sleep(4)

            print("\ncaption : TRUE(1) , FALSE(0)\n")
            time.sleep(2)
            CaptionCount = VideoCatagoryData.groupby(
                'caption')['video_category_id'].count()
            print(CaptionCount)
            time.sleep(2)
            print("\nWe have  %s  videos with the caption and  %s  videos without the caption for catagory  %s" %
                  (CaptionCount.get(1, 0), CaptionCount.get(0, 0),
                   VideoCatagory))
            time.sleep(4)

            # This section is about 'licensedContent' (its header used to say
            # "caption analysis")
            print("\nlicensedContent analysis: TRUE(1) , FALSE(0)\n")
            time.sleep(2)
            licenseCount = VideoCatagoryData.groupby(
                'licensedContent')['video_category_id'].count()
            print(licenseCount)
            time.sleep(2)
            # Both numbers come from licenseCount (the "without the License" count
            # used to be read from CaptionCount by mistake)
            print("\nWe have  %s  videos with the License and  %s  videos without the License for catagory  %s" %
                  (licenseCount.get(1, 0), licenseCount.get(0, 0),
                   VideoCatagory))
            time.sleep(4)
            return True

        except Exception:
            # 'Exception' rather than a bare except: a Ctrl-C during one of the
            # many pauses must not be reported as an analysis error.
            raise VideoAnalysisException(
                " Error while analyzing data for individual video catagory ")
Exemple #17
0
    def generatePredictingModel(data):
        """
            Build the prediction model (based on the data set we have) in order to be able to predict the category
            of a new video from the user input

            Two classifiers are trained on the word counts of the titles and of the
            descriptions; a decision tree is then trained on the predictions of these
            two classifiers.

            Input : data set holding the 'title', 'description' and 'video_category_id' columns
            Output : tuple (tree, model_count_title, model_count_description,
                     count_vectorizer_title, count_vectorizer_description)
            Raises : VideoAnalysisException if the model cannot be built
        """
        try:
            # Intitialize a timer to compute the time to build the model
            start = time.time()

            # Split into train-test data set.
            # The target column is selected by name: the former test
            # "x in ('video_category_id')" was a substring test against a plain
            # string (the parentheses do not make a tuple), so any column whose name
            # is contained in 'video_category_id' would have been selected too.
            X = data[['title', 'description']]
            Y = data[['video_category_id']]
            # Only the training part (80%) is used to build the models; the fixed
            # random_state keeps the split reproducible.
            X_train, _, Y_train, _ = train_test_split(
                X, Y, train_size=0.80, random_state=10)
            train_labels = Y_train['video_category_id']

            # Build the 2 text corpus
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # initializes the 2 vectorizers.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()

            # learn the 2 vocabulary dictionary
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices (training part only: the matrices and
            # predictions formerly computed for the test part were never used)
            X_train_count_title = count_vectorizer_title.transform(
                X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(
                X_train['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, train_labels)
            model_count_description.fit(X_train_count_description,
                                        train_labels)

            # Merge the title and description predictions and build a new prediction based on these 2 predictions combined
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(
                X_train_count_title)
            new_df_train[
                'description_prediction'] = model_count_description.predict(
                    X_train_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, train_labels)

            end = time.time()
            execution_time = end - start

            print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(
                execution_time))
            time.sleep(3)

            return tree, model_count_title, model_count_description, count_vectorizer_title, count_vectorizer_description

        except Exception:
            # 'Exception' rather than a bare except, so that Ctrl-C / SystemExit
            # are not disguised as a model-building error.
            raise VideoAnalysisException(
                " Error while creation of predictive model ")