Example #1
0
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

data = pd.read_csv("x.txt", sep='\t')
data.columns = ['label','body_text']


count_vect = CountVectorizer(analyzer = clearn_text) # clearn_text is a handmade function
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_name())

X_counts_df = pd.DataFrame(X_counts_sample.toarray()) # till now we can see how many times a word appeared in a sentence


# With N-grams ---------------------------------------------------------------------------------
ngram_vect = CountVectorizer(ngram_range=(1,3))
X_counts = ngram_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(ngram_vect.get_feature_name())

X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df.columns = ngram_vect.get_feature_names()

'''
# TF-IDF ----------------------------------------------------------------------------------------
# need to learn more
1st count how many times a word appear in a sentence
2nd count how many sentence including this word too
3rd show the percentage