def plot_most_frequent_tokens(self, df, column_name):
    """Plot a frequency distribution of the most frequent tokens in a text column.

    Args:
        df: DataFrame containing the documents to analyze.
        column_name: Name of the column in ``df`` holding raw text.

    Side effects:
        Fits a fresh CountVectorizer on the column and displays a
        yellowbrick FreqDistVisualizer plot.
    """
    count_vectorizer = CountVectorizer()
    tf_original = count_vectorizer.fit_transform(df[column_name])
    # get_feature_names() was deprecated in scikit-learn 0.24 and removed
    # in 1.2; get_feature_names_out() is the supported replacement.
    tf_feature_names = count_vectorizer.get_feature_names_out()
    visualizer = FreqDistVisualizer(features=tf_feature_names, orient='v')
    visualizer.fit(tf_original)
    visualizer.show()
def bagOfWords(featureTrain, stopWords=False, countWords=False, plot=False):
    """Build a bag-of-words count matrix from ``featureTrain.headline``.

    Args:
        featureTrain: DataFrame-like object with a ``headline`` text column.
        stopWords: If truthy, drop English stop words before counting.
        countWords: If truthy, visualize and plot the 20 most frequent tokens.
        plot: Unused; kept so existing call sites remain valid.

    Returns:
        Tuple ``(count_vect, X_train_counts)`` — the fitted CountVectorizer
        and the sparse document-term count matrix.
    """
    # Keep the original `== False` comparison: `not stopWords` would change
    # which branch values like None take, and callers may rely on that.
    if stopWords == False:  # noqa: E712
        count_vect = CountVectorizer()
    else:
        count_vect = CountVectorizer(stop_words='english')
    X_train_counts = count_vect.fit_transform(featureTrain.headline)

    if countWords:
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is the supported replacement.
        features = count_vect.get_feature_names_out()
        visualizer = FreqDistVisualizer(features=features, n=20, orient='v')
        visualizer.fit(X_train_counts)
        words = countTopWords(X_train_counts, count_vect, 20)
        if stopWords:
            # First show() saves to disk, second displays interactively.
            visualizer.show(outpath="SWRemovedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Removed", words, 20, stopWords)
        else:
            visualizer.show(outpath="SWIncludedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Included", words, 20, stopWords)

    return count_vect, X_train_counts
print(x_train.shape)
print(x_test.shape)
print('\n')

print("________________________________-Text preparation___________________")
# Convert the raw text corpus into a sparse token-count matrix.
print("_____________________Contvervectorizer___________________________")
con_vec = CountVectorizer(stop_words=stopwords.words('english'))
x_train_count = con_vec.fit_transform(x_train)

# Token frequency distribution of the training corpus.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
feature = con_vec.get_feature_names_out()
visualizer = FreqDistVisualizer(features=feature, orient='v')
visualizer.fit(x_train_count)
visualizer.show()

# Re-weight the raw counts into TF-IDF scores.
print("---------------------TfdiTransformer------------------------------------------------------")
tfidftransformer = TfidfTransformer()
x_train_tfidf = tfidftransformer.fit_transform(x_train_count)
print(x_train_tfidf.shape)

# TfidfVectorizer combines counting and TF-IDF weighting in one step.
# NOTE(review): this rebinds x_train_tfidf, discarding the TfidfTransformer
# result computed just above — presumably the vectorizer output is the one
# fed to the models; confirm downstream usage.
print("-------------------------TfidfVectorize---------------------------")
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
print(x_train_tfidf)
print('\n')

# We used 3 algorithms to test our model: SVM model, Random Forest and Decision Tree
df[new_text_field_name] = df[new_text_field_name].apply( lambda elem: re.sub(r"\d+", "", elem)) return df data_clean = clean_text(train_data, 'text', 'text') # Removes stop words data_clean['text'] = data_clean['text'].apply( lambda x: ' '.join([word for word in x.split() if word not in (stop)])) vectorizer = CountVectorizer() docs = vectorizer.fit_transform(data_clean['text']) features = vectorizer.get_feature_names() visualizer = FreqDistVisualizer(features=features, orient='v') visualizer.fit(docs) visualizer.show() disaster_tweets = data_clean[data_clean['target'] == 1] vectorizer = CountVectorizer() docs = vectorizer.fit_transform(disaster_tweets['text']) features_disaster = vectorizer.get_feature_names() visualizer_disaster = FreqDistVisualizer(features=features_disaster, orient='v') visualizer_disaster.fit(docs) visualizer_disaster.show() ###Logistic regression X_train, X_test, y_train, y_test = train_test_split(data_clean['text'], data_clean['target'], random_state=0)