# Launch NLTK's interactive downloader (used to fetch corpora such as "stopwords").
# NOTE(review): the original fused both statements onto one physical line,
# which is a SyntaxError in Python; they are separated here.
from nltk import download_shell

download_shell()
# Jupyter-notebook export: load the SMS spam dataset with pandas.
# NOTE(review): the export collapsed every cell onto one commented-out line,
# so none of it could execute; the statements are restored below.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib plots inline (works only inside IPython/Jupyter).
get_ipython().run_line_magic('matplotlib', 'inline')

import nltk

# Interactive download shell — used here to fetch the "stopwords" package.
nltk.download_shell()

# Load the tab-separated dataset; each row is "<label>\t<message>".
messages = pd.read_csv('SMSSpamCollection', sep='\t',
                       names=['label', 'message'])

print(len(messages))
# Compares their features. Use Bag of Words and improve it by adjusting word
# counts based on their frequency in the corpus (the group of all documents)
# with TF-IDF (Term Frequency - Inverse Document Frequency).
#
#   Term Frequency (TF) is the importance of the term within that document:
#     TF(d, t) = number of occurrences of term t in document d
#   Inverse Document Frequency (IDF) is the importance of the term in the corpus:
#     IDF(t) = log(D / t), where D is the total number of documents and t is
#     the number of documents containing the term.
#
# TF-IDF combines the term's importance to the document and to all documents.
# A collection of texts is sometimes called a "corpus".

# Install the nltk library if it is not installed; we will use the stopwords
# file from this library.
import nltk

# Optionally, download the stopwords file to check its content:
# enter d to enter download mode, or l to list all files; enter "stopwords"
# to download the file, then q to quit the shell.
nltk.download_shell()

# Read sample messages and convert them to a list.
file = './Documents/Workspace/Python-For-Data-Science/sample_SMSSpamCollection'
messages = [line.rstrip() for line in open(file)]

# Check the number of messages and one specific message's content.
# Notice there is a tab separator (\t) between the message class and content.
print(len(messages))
messages[50]

# Use enumerate to number and print the first ten messages.
# message_no is not part of the file but comes from the enumerate call.
for message_no, message in enumerate(messages[:10]):
    # NOTE(review): loop body was missing in the source; restored to match
    # the sibling script's print loop — confirm against the original notebook.
    print(message_no, message)
# -*- coding: utf-8 -*-
"""
Spyder Editor

Load the SMS spam dataset twice: first as a plain list of stripped lines,
then as a pandas DataFrame with labeled columns.
"""
import nltk

# Interactive downloader — used here to fetch the "stopwords" package.
nltk.download_shell()

# Get the working directory and (re)set the working directory.
from os import chdir, getcwd

wd = getcwd()
wd
chdir(wd)

# rstrip() plus a list comprehension yields a list of all message lines.
# BUG FIX(review): the original opened 'Desktop\SMSSpamCollection' — a
# backslash path that is non-portable (and a deprecated escape sequence);
# the pandas call below in this same script already uses the forward-slash
# form, so that form is used consistently here.
messages = [line.rstrip() for line in open('Desktop/SMSSpamCollection')]
print(len(messages))

# Print the first ten messages, numbered via enumerate.
for message_no, message in enumerate(messages[:10]):
    print(message_no, message)
    print('\n')

# The data is a tab-separated file; import it using pandas.
import pandas as pd

messages = pd.read_csv('Desktop/SMSSpamCollection', sep='\t',
                       names=["label", "message"])
messages.head()