def predictVideoCategory(self, tree, model_count_title,
                         model_count_description, count_vectorizer_title,
                         count_vectorizer_description):
    """
    Predict the category of the video based on 'title' and 'description'.

    The title and the description are vectorized separately, each one is
    classified by its own model, and the two intermediate predictions are
    then given to ``tree`` which produces the final prediction.

    Input  : tree                          - classifier applied to the two
                                             intermediate predictions
             model_count_title             - classifier for the title
             model_count_description       - classifier for the description
             count_vectorizer_title        - vectorizer for the title
             count_vectorizer_description  - vectorizer for the description
    Output : first element of the prediction returned by ``tree``
    Raises : VideoAnalysisException if any step of the prediction fails
    """
    try:
        # Transform the text features into the vectorizers' representation
        title_matrix = count_vectorizer_title.transform([self.title])
        description_matrix = count_vectorizer_description.transform(
            [self.description])

        # Merge the 'title' and 'description' predictions. The column order
        # is pinned explicitly: when only a dict is given, the order depends
        # on the Python/pandas version (keys were sorted on Python 2).
        video_df = pd.DataFrame(
            {
                'title_prediction':
                    model_count_title.predict(title_matrix),
                'description_prediction':
                    model_count_description.predict(description_matrix),
            },
            columns=['title_prediction', 'description_prediction'])

        return tree.predict(video_df)[0]
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a prediction error.
        raise VideoAnalysisException(
            " Error while predicting video catagory")
def plotFeatureImportance(data, clf):
    '''
    Plot a bar chart of the numerical feature importances of a classifier.

    The chart is saved to ../YoutubeData/feature_importance.pdf and then
    displayed with plt.show().

    Input  : data - not used by this function
             clf  - classifier exposing ``feature_importances_``
    Output : True once the chart has been saved and shown
    Raises : VideoAnalysisException if anything fails while plotting
    '''
    try:
        print("Generating the Feature Importance bar chart...\n")
        time.sleep(3)

        # NOTE(review): the labels are hard-coded; they presumably have to
        # match the features (and their order) that clf was trained on.
        predictor_var = ["viewCount", "likeCount", "dislikeCount",
                         "favoriteCount", "commentCount", "caption"]
        importances = clf.feature_importances_
        positions = np.arange(len(importances))
        width = 0.7

        fig, ax = plt.subplots(figsize=(15, 15))
        ax.bar(positions, importances, width, color='b')
        ax.set_xticks(positions)
        ax.set_xticklabels(predictor_var, rotation=45)
        plt.title('Numerical Features Importance', fontsize=20)
        ax.set_ylabel('Normalized Entropy Importance')

        # Save before show(): the figure is written while it still exists.
        name = "../YoutubeData/feature_importance.pdf"
        plt.savefig(name)
        print("\nPlease close the Bar Chart when you want to move ahead...")
        plt.show()
        print("You can always retrieve the Feature Importance bar chart in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C during the pauses
        # is not reported as a plotting error.
        raise VideoAnalysisException(" Error while ploating Feature Importance Graph ")
def plotGraphicalCorrelationMatrix(data):
    '''
    Draw the correlation matrix of the data as a heatmap.

    The heatmap is saved to ../YoutubeData/correlation_matrix.pdf and then
    displayed with plt.show().

    Input  : data - object whose corr() result is plotted
    Output : True once the heatmap has been saved and shown
    Raises : VideoAnalysisException if anything fails while plotting

    Inspired from : https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
    '''
    try:
        print("\nGenerating the graphical correlation matrix...\n")
        time.sleep(3)

        corr = data.corr()
        f, ax = plt.subplots(figsize=(20, 20))

        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw the heatmap with square cells (no mask is applied); the tick
        # labels are disabled here and set by hand below.
        sns.heatmap(corr, cmap=cmap, square=True, xticklabels=False,
                    yticklabels=False, linewidths=.5,
                    cbar_kws={"shrink": .5}, ax=ax)
        plt.title('Correlation Matrix', fontsize=30)
        ax.set_ylabel('Features', fontsize=20)
        ax.set_xlabel('Features', fontsize=20)

        # NOTE(review): hard-coded labels, presumably in the column order of
        # corr; the y labels are the same list reversed - confirm that both
        # still match what the installed seaborn/matplotlib draw.
        xticklabels = ['video_category_id', 'viewCount', 'likeCount',
                       'dislikeCount', 'favoriteCount', 'commentCount',
                       'dimension', 'definition', 'caption',
                       'licensedContent']
        ylabel = xticklabels[::-1]
        ax.set_xticklabels(xticklabels, rotation=45)
        ax.set_yticklabels(ylabel, rotation=0)

        name = "../YoutubeData/correlation_matrix.pdf"
        plt.savefig(name)
        # The prompt used to talk about a "Bar Chart" (copy-paste from the
        # feature importance plot); this window shows the correlation matrix.
        print("\nPlease close the correlation matrix when you want to move ahead...")
        plt.show()
        print("You can always retrieve the graphical correlation matrix in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C during the pauses
        # is not reported as a plotting error.
        raise VideoAnalysisException(" Error while Generating the graphical correlation matrix")
def visualizeData(self, data):
    """
    Display general information on the data and show the 5 first rows.

    Every message is followed by a pause so that it can be read.

    Input  : data - dataframe whose columns and head() are displayed
    Output : True once everything has been displayed
    Raises : VideoAnalysisException if anything fails while displaying
    """
    try:
        def announce(message, pause):
            # Print one message, then wait 'pause' seconds.
            print(message)
            time.sleep(pause)

        announce("\n The data we are using are metadata on YouTube videos...", 3)
        announce("\n The metadata we have on the videos are the following: ", 3)
        for col in data.columns:
            announce("\n > " + col, 1.5)
        announce("\n We use these data to build a model able to perform prediction on the category to which each video belong...", 3)
        announce("\n The possible categories to which a video can belong are the following (only one category per video): ", 3)
        announce("\n > Autos & Vehicles, Comedy, Education, Entertainment, Film & Animation, Gaming, Howto & Style, Music, "
                 "News & Politics, Nonprofits & Activism, People & Blogs, Pets & Animals, Science & Technology, Sports, Travel & Events ", 6)
        announce("\n In order to do that, we have a data set of about 240.000 YouTube videos...", 3)
        announce("\n Now here is what the first five rows of our data set look like --> ...", 3)
        announce("\n (You can always go check the full file : YouTubeData/train_sample.csv ...)", 3)
        announce(data.head(), 6)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C during the pauses
        # is not reported as a visualization error.
        raise VideoAnalysisException(
            " Error while Performing Data Visualization ")
def binaryTree(self, data):
    '''
    Build a first exploratory model on the non-text data, to allow basic
    data analytics such as feature importance plotting.

    Input  : data - dataframe including the predictor columns listed below
                    and the 'video_category_id' target column
    Output : decision tree classifier trained on 80% of the data
    Raises : VideoAnalysisException if the tree cannot be built
    '''
    try:
        target_var = "video_category_id"
        predictor_var = [
            "viewCount", "likeCount", "dislikeCount", "favoriteCount",
            "commentCount", "caption"
        ]

        # Split Data Set. Only the training part is used here; the split is
        # seeded so the training rows are the same on every run.
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(
            data[predictor_var], data[target_var],
            train_size=0.80, random_state=10)

        # Build model
        decision_tree = DecisionTreeClassifier(criterion="entropy",
                                               min_samples_split=2,
                                               min_samples_leaf=128)
        decision_tree.fit(Xtrain, Ytrain)
        return decision_tree
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a model-building error.
        raise VideoAnalysisException(
            " Error while building the decision tree")
def printVideoCategories(self):
    """
    Print every entry of self.Catagory_mapping, one per line, in the form
    'key --> value'.

    Raises : VideoAnalysisException if the mapping cannot be printed
    """
    try:
        for key in self.Catagory_mapping:
            # Single-argument print: same output on Python 2, and also valid
            # on Python 3.
            print("{} --> {}".format(key, self.Catagory_mapping[key]))
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a printing error.
        raise VideoAnalysisException(" Error while printing video catagories ")
def __init__(self, title, description):
    """
    Video Constructor: store the title and the description of the video.

    Input  : title       - title of the video
             description - description of the video
    Raises : VideoAnalysisException if the attributes cannot be set
    """
    try:
        self.title = title
        self.description = description
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as an initialization error.
        raise VideoAnalysisException(
            " Error while initializing Video class instance ")
def __init__(self):
    """
    Initialize the program: display the program information, then create
    the DataManager and DataExplorer instances used afterwards.

    Raises : VideoAnalysisException if any of these steps fails
    """
    try:
        self.printProgramInfos()
        self.datamanager = DataManager()
        self.dataexplorer = DataExplorer()
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as an initialization error.
        raise VideoAnalysisException(" Error while initializing FlowManager Instance ")
def printFeatures(self):
    """
    Print every entry of self.features, one per line, in the form
    'key --> value'.

    Raises : VideoAnalysisException if the features cannot be printed
    """
    try:
        for key in self.features:
            # Single-argument print: same output on Python 2, and also valid
            # on Python 3.
            print("{} --> {}".format(key, self.features[key]))
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a printing error.
        raise VideoAnalysisException(" Error while printing different features of the data ")
def load_data(self):
    '''
    Load the training sample into a pandas dataframe and shuffle its rows.

    Output : dataframe read from ../YouTubeData/train_sample.csv, reindexed
             with a random permutation of its index
    Raises : VideoAnalysisException if the file cannot be loaded
    '''
    try:
        # NOTE(review): this path spells the folder 'YouTubeData' while the
        # plotting code writes to 'YoutubeData' - confirm which one exists on
        # case-sensitive file systems.
        dataframe = pd.read_csv('../YouTubeData/train_sample.csv')
        shuffled_index = np.random.permutation(dataframe.index)
        return dataframe.reindex(shuffled_index)
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a loading error.
        raise VideoAnalysisException(" Error while loading the data ")
def generalAnalysis(self, data, clf):
    """
    Perform the general analysis: feature importance bar chart, numerical
    correlation matrix, then graphical correlation matrix.

    Input  : data - data passed to the three plotting functions
             clf  - classifier passed to the feature importance plot
    Output : True once the three steps are done
    Raises : VideoAnalysisException if any of the steps fails
    """
    try:
        dataplotter.plotFeatureImportance(data, clf)
        dataplotter.plotNumericalCorrelationMatrix(data)
        dataplotter.plotGraphicalCorrelationMatrix(data)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as an analysis error.
        raise VideoAnalysisException(" Error while performing general analysis ")
def featuresBarPlot(barNames, barValues):
    """
    Display a bar plot with the barValues as the length of the bars and
    barNames as the name of the bars (written vertically).

    Input  : barNames  - name of the bars
             barValues - value of the bars
    Output : None; the bar plot is displayed with plt.show()
    Raises : VideoAnalysisException if the plot cannot be drawn
    """
    try:
        # One position per bar, shared by the bars and their tick labels
        positions = range(len(barNames))
        plt.bar(positions, barValues)
        plt.xticks(positions, barNames, rotation='vertical')
        plt.show()
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a plotting error.
        raise VideoAnalysisException(
            " Error while ploating features Bar Plot ")
def plotNumericalCorrelationMatrix(data):
    '''
    Print the numerical correlation matrix of the data.

    Input  : data - object whose corr() result is printed
    Output : True once the matrix has been printed
    Raises : VideoAnalysisException if the matrix cannot be displayed
    '''
    try:
        print("Displaying numerical correlation matrix...\n")
        time.sleep(3)
        print(data.corr())
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C during the pauses
        # is not reported as a display error.
        raise VideoAnalysisException(" Error while Displaying numerical correlation matrix ")
def individual_feature_analysis(self, data, chosenFeature):
    """
    Compute the mean of the chosen feature for every video category and
    plot the result as a bar chart.

    The chart is saved to ../YoutubeData/FeatureBarChart.pdf and then
    displayed with plt.show().

    Input  : data          - dataframe with a 'video_category_id' column
             chosenFeature - key of self.features giving the column to use
    Output : True once the chart has been saved and shown
    Raises : VideoAnalysisException if the analysis fails
    """
    try:
        print("chosen feature:  {}".format(self.features[chosenFeature]))
        featuresMeans = data.groupby(
            ['video_category_id'])[self.features[chosenFeature]].mean()
        featuresNames = [self.Catagory_mapping[x] for x in featuresMeans.index]

        # Draw the bars first and save before show(). Saving used to happen
        # before anything was drawn, which wrote an empty figure to the PDF.
        positions = range(len(featuresNames))
        plt.bar(positions, featuresMeans.values)
        plt.xticks(positions, featuresNames, rotation='vertical')
        name = "../YoutubeData/FeatureBarChart.pdf"
        plt.savefig(name)

        print("\nPlease close the Bar Chart when you want to move ahead...")
        plt.show()
        print("You can always retrieve the Feature Importance bar chart in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as an analysis error.
        raise VideoAnalysisException(" Error while performing individual feature analysis ")
def clean_data(self, data):
    '''
    Convert the categorical columns into numeric ones and fill in the
    missing descriptions. The dataframe is modified in place and returned.

    dimension       : 2d -> 1 , anything else (3d) -> 0
    definition      : hd -> 1 , anything else (sd) -> 0
    caption         : True -> 1 , False -> 0
    licensedContent : True -> 1 , False -> 0
    description     : NaN -> 'No description'

    Input  : data - dataframe containing the five columns above
    Output : the same dataframe, cleaned
    Raises : VideoAnalysisException if the data cannot be cleaned
    '''
    try:
        data['dimension'] = (data['dimension'] == '2d') * 1
        data['definition'] = (data['definition'] == 'hd') * 1
        data['caption'] = (data['caption']) * 1
        data['licensedContent'] = (data['licensedContent']) * 1

        # Replace the NaN values in 'description'. The result is assigned
        # back to the column: calling fillna(inplace=True) on the selected
        # column may only change a temporary copy and leave 'data' untouched.
        data['description'] = data['description'].fillna('No description')
        return data
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a cleaning error.
        raise VideoAnalysisException(" Error while cleaning the data ")
def individual_videocatagory_analysis(self, data, video_id):
    """
    Print an analysis of the data for one video category.

    The rows of the chosen category are selected, then the following are
    printed, with pauses in between: the describe() summary, the Pearson
    correlation between the count features, and the number of videos per
    value of each binary feature (dimension, definition, caption,
    licensedContent).

    Input  : data     - cleaned data set
             video_id - id of the video category (converted with int()),
                        used as a key of self.Catagory_mapping
    Output : True once the analysis has been printed
    Raises : VideoAnalysisException if the analysis fails
    """
    try:
        video_id = int(video_id)
        VideoCatagory = self.Catagory_mapping[video_id]
        count_features = [
            "viewCount", "likeCount", "dislikeCount", "favoriteCount",
            "commentCount"
        ]

        time.sleep(2)
        print("\n===============")
        print("Analysis for video catagory =  {}".format(VideoCatagory))
        print("===============\n")

        VideoCatagoryData = data[data["video_category_id"] == video_id]
        description = VideoCatagoryData.describe()
        correlation = VideoCatagoryData[count_features].corr(
            method='pearson', min_periods=1)

        def report_binary_feature(header, column, summary, first, second):
            # Print how many videos of the category have each value of a
            # 0/1 column. 'first' and 'second' are the values whose counts
            # fill the two first slots of 'summary'.
            print(header)
            time.sleep(2)
            counts = VideoCatagoryData.groupby(
                column)['video_category_id'].count()
            print(counts)
            time.sleep(2)
            # .get(value, 0): a value that no video of this category has is
            # absent from 'counts'; report it as 0 instead of failing.
            print(summary.format(counts.get(first, 0),
                                 counts.get(second, 0), VideoCatagory))
            time.sleep(4)

        time.sleep(2)
        print("\ndescription of each feature - \n")
        time.sleep(3)
        print(description)
        time.sleep(3)
        print("\ncorrelation within count features - \n")
        time.sleep(3)
        print(correlation)
        time.sleep(3)
        print("\nAnalysis based on count for Binary features :------> \n")
        time.sleep(3)

        report_binary_feature(
            "\nDimension analysis: 2d(1) , 3d(0)\n", 'dimension',
            "\nWe have  {}  3D videos and  {}  2D videos for catagory  {}",
            0, 1)
        report_binary_feature(
            "\ndefinition analysis: hd(1) , sd(0)\n", 'definition',
            "\nWe have  {}  SD videos and  {}  HD videos for catagory  {}",
            0, 1)
        report_binary_feature(
            "\ncaption : TRUE(1) , FALSE(0)\n", 'caption',
            "\nWe have  {}  videos with the caption and  {}  videos without the caption for catagory  {}",
            1, 0)
        # This section used to be titled "caption analysis" and to print the
        # caption count as the number of videos without a license.
        report_binary_feature(
            "\nlicensedContent analysis: TRUE(1) , FALSE(0)\n",
            'licensedContent',
            "\nWe have  {}  videos with the License and  {}  videos without the License for catagory  {}",
            1, 0)
        return True
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C during the pauses
        # is not reported as an analysis error.
        raise VideoAnalysisException(
            " Error while analyzing data for individual video catagory ")
def generatePredictingModel(data):
    """
    Build the prediction model (based on the data set we have) in order to
    be able to predict the category of a new video from the user input.

    Two models are trained on word counts, one on the titles and one on the
    descriptions; a decision tree is then trained on the predictions of
    these two models. Only 80% of the data is used for training.

    Input  : data - dataframe with the 'title', 'description' and
                    'video_category_id' columns
    Output : tree, model_count_title, model_count_description,
             count_vectorizer_title, count_vectorizer_description
    Raises : VideoAnalysisException if the model cannot be built
    """
    try:
        # Initialize a timer to compute the time to build the model
        start = time.time()

        # Split into train-test data set. The columns are selected by name:
        # the previous "x in ('video_category_id')" was a substring test on
        # a plain string, so it could also pick other columns.
        X = data[['title', 'description']]
        Y = data['video_category_id']
        # The held-out part is not used here; the split is kept (and seeded)
        # so the training rows are the same on every run.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, train_size=0.80, random_state=10)

        # Build the 2 text corpus
        corpus_title = X_train['title'].values.tolist()
        corpus_description = X_train['description'].values.tolist()

        # Initialize the 2 vectorizers and learn the 2 vocabularies
        count_vectorizer_title = CountVectorizer()
        count_vectorizer_description = CountVectorizer()
        count_vectorizer_title.fit(corpus_title)
        count_vectorizer_description.fit(corpus_description)

        # Build the sparse matrices of the training set
        X_train_count_title = count_vectorizer_title.transform(
            X_train['title'])
        X_train_count_description = count_vectorizer_description.transform(
            X_train['description'])

        # Set and train the models (for title and description features)
        model_count_title = BernoulliNB()
        model_count_description = BernoulliNB()
        model_count_title.fit(X_train_count_title, Y_train)
        model_count_description.fit(X_train_count_description, Y_train)

        # Merge the title and description predictions and build a new
        # prediction based on these 2 predictions combined. The column order
        # is pinned so it does not depend on dict ordering.
        new_df_train = pd.DataFrame(
            {
                'title_prediction':
                    model_count_title.predict(X_train_count_title),
                'description_prediction':
                    model_count_description.predict(
                        X_train_count_description),
            },
            columns=['title_prediction', 'description_prediction'])

        tree = DecisionTreeClassifier()
        tree.fit(new_df_train, Y_train)

        end = time.time()
        execution_time = end - start
        print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(
            execution_time))
        time.sleep(3)
        return tree, model_count_title, model_count_description, count_vectorizer_title, count_vectorizer_description
    except Exception:
        # 'Exception' (not a bare except) so that Ctrl-C / SystemExit are
        # not reported as a model-building error.
        raise VideoAnalysisException(
            " Error while creation of predictive model ")