# Bar charts of how many rows fall into each subjectivity / sentiment category.
# NOTE(review): no explicit `ax` is passed, so both Series plots may be drawn
# onto the same current axes when run together -- confirm this is intended.
df['subjcat'].value_counts().plot(kind='bar')
df['sentcat'].value_counts().plot(kind='bar')

# Keep only the rows whose sentiment category is 'positive' or 'negative'.
df = df[df['sentcat'].isin(['positive', 'negative'])]


# In[26]:


# BUILDING THE CLASSIFIERS

# ENCODING THE LABELS
# Turn each distinct value of labeled["emotions"] into an integer code.
# NOTE(review): the codes are computed from `labeled` but stored on `filtered`;
# this only lines up if both frames hold the same rows -- confirm.
le = LabelEncoder()
filtered["emotion_cat"] = le.fit_transform(labeled["emotions"])

# CONVERT TO LIST AND FIT / MAX FEATURES
tfidf = TfidfVectorizer()
# NOTE(review): `tfidfconverter` carries the tuned settings (feature cap,
# document-frequency bounds, English stop words) but is not used anywhere in
# this section; the default-configured `tfidf` is the one that gets fitted.
# Confirm which vectorizer was meant.
tfidfconverter = TfidfVectorizer(
    max_features=30000,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)

# Fixed: the method is `fit_transform`; the previous spelling
# (`fit_transfrorm`) raised AttributeError.
# NOTE(review): the text comes from `df` (already restricted to
# positive/negative rows above) while the result is stored on `labeled`, and
# fit_transform yields a 2-D sparse matrix rather than one value per row --
# verify this column assignment does what is expected.
labeled['transformed_tweet'] = tfidf.fit_transform(df['filtered'])
myset = labeled[['emotions', 'transformed_tweet']].copy()

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state=0 keeps the split
# reproducible between runs.
# NOTE(review): X and y are not assigned in this section -- presumably the
# TF-IDF features and the encoded labels; confirm where they are defined.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# DON'T FORGET TO TIME IT
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Fixed: use a classifier instead of DecisionTreeRegressor. The targets are
# class labels and the predictions feed confusion_matrix, which expects
# discrete classes rather than a regressor's float outputs.
classifier1 = DecisionTreeClassifier()
classifier1.fit(X_train, y_train)
y_pred = classifier1.predict(X_test)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)