# Example #1  (stray notebook residue; original cell output was: 0)
# Keep only the word from each (word, count) pair that Counter.most_common()
# produced; `word[:][0]` made a redundant copy of the tuple before indexing,
# so it is simplified to `word[0]` (same result).
vocab = [word[0] for word in vocab]

print(vocab)

# now we get 300 words as vocab and content_final (content that has been cleared)

# Build the TF-IDF matrix restricted to the pre-selected vocabulary.
# NOTE: this can take some time (sklearn TfidfVectorizer).
tfidf = TfidfVectorizer(analyzer='word',
                        stop_words=nltk.corpus.stopwords.words('indonesian'),
                        ngram_range=(1, 1),
                        min_df=0.04,
                        vocabulary=vocab)
tfidf_hasil = tfidf.fit_transform(content_final)
# BUG FIX: `get_feature_name()` does not exist on TfidfVectorizer; the
# correct API is get_feature_names_out() (sklearn >= 1.0; on versions
# older than 1.0 use get_feature_names()).
features = tfidf.get_feature_names_out()
print(features)
print(tfidf_hasil.toarray())

# In[5]:

import numpy
# Export the dense TF-IDF matrix to CSV.
# Changed .todense() -> .toarray(): todense() returns the discouraged
# numpy.matrix type, while toarray() returns a plain ndarray with the same
# values and is the form already used for printing above.
numpy.savetxt('D:/SKRIPSI/percobaan/tfidf1332.csv',
              tfidf_hasil.toarray(),
              delimiter=',')

# In[1]:

#df = pd.DataFrame(data = vocab)
#df