# Create clean_train_reviews and clean_test_reviews as we did before
#

import pandas as pd
import numpy as np
import Preprocessing_nlp as pre

# Read data from files
train = pd.read_csv(data_path + 'labeledTrainData.tsv', header = 0, \
                    delimiter = '\t', quoting = 3)
test = pd.read_csv(data_path + 'testData.tsv', header = 0, \
                   delimiter = '\t', quoting = 3)
unlabeled_train = pd.read_csv(data_path + 'unlabeledTrainData.tsv', header = 0, \
                              delimiter = '\t', quoting = 3)
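# Sanity check on the sizes of the three datasets just read
# (the exact counts depend on the downloaded data files).
print "Read %d labeled train reviews, %d test reviews, " \
      "and %d unlabeled reviews" % (train["review"].size,
                                    test["review"].size,
                                    unlabeled_train["review"].size)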


    print "Cleaning training reviews"
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append( pre.review_to_wordlist( review, \
            remove_stopwords=True ))

    print "Cleaning test reviews"
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append( pre.review_to_wordlist( review, \
            remove_stopwords=True ))


# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( (train["review"].size, num_clusters), \
    dtype="float32" )

# Transform the training set reviews into bags of centroids
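# A minimal sketch of the transform, assuming a word_centroid_map
# (word -> cluster index, e.g. from k-means over the Word2Vec vocabulary) and
# a helper create_bag_of_centroids() were defined earlier; both names are
# assumptions, not defined in this section.
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1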
# ****** Bag-of-words pipeline on the labeled training data
df = pd.read_csv(data_path + 'labeledTrainData.tsv', header = 0, \
                 delimiter = '\t', quoting = 3)
num_docus = df['review'].size


# 1. Remove the HTML markup (like <br>), remove non-letters, convert to lower case,
#    split into words, remove stopwords, and join the words back into one string
#    separated by spaces

clean_docus = []
for i in xrange(0, num_docus):

    # Print a progress message every 1000 reviews
    if ((i+1)%1000 == 0):
        print "review %d of %d\n" % (i+1, num_docus)

    clean_docus.append(pre.review_to_words(df['review'][i]))

# 1.2 Further filtering of additional words (optional)
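# A minimal sketch of this optional step, assuming a small hypothetical list
# of extra words to drop from the already-cleaned, space-joined documents:
extra_words = ['movie', 'film']   # hypothetical example words, adjust as needed
clean_docus = [' '.join(w for w in doc.split() if w not in extra_words)
               for doc in clean_docus]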


    
#####################################################
# 2.1 Create features from the bag of words
print 'creating the bag of words...\n'
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer( analyzer = 'word', \
                            tokenizer = None, \
                            preprocessor = None, \
                            stop_words = None, \
                            max_features = 5000)
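
# Fit the vectorizer on the cleaned documents and build the feature matrix,
# the standard scikit-learn step that would follow here (a sketch; .toarray()
# materialises the sparse counts as a dense numpy array).
train_data_features = vectorizer.fit_transform(clean_docus)
train_data_features = train_data_features.toarray()
print 'bag of words feature matrix shape:', train_data_features.shape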