df = pd.read_csv(data_path + 'labeledTrainData.tsv', header = 0, \
                    delimiter = '\t', quoting = 3)
num_docus = train['review'].size


#1. remove the HTML markup( like <br>), remove non-letters, convert to lower case, split into 
# words, remove stopwords, join words back into one string separated by space

import Preprocessing_nlp as pre
clean_docus = []
for i in xrange(0, num_docus):
    
    if ((i+1)%1000 == 0):
        print "review %d of %d\n" % (i+1, num_docus)
    
    clean_docus.append(pre.review_to_words(df['clean_url'][i], filter_words = 'timeline'))

# 1.2 further filtering more words (optional)


    
#####################################################
#2.1    create features from the bag of words
print 'creating the bag of words...\n'
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer( analyzer = 'word', \
                            tokenizer = None, \
                            preprocessor = None, \
                            stop_words = None, \
                            max_features = 5000)